1 // Copyright (C) 2019 The Android Open Source Project
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14
15 #include <libsnapshot/snapshot.h>
16
17 #include <dirent.h>
18 #include <fcntl.h>
19 #include <math.h>
20 #include <sys/file.h>
21 #include <sys/types.h>
22 #include <sys/unistd.h>
23
24 #include <filesystem>
25 #include <optional>
26 #include <thread>
27 #include <unordered_set>
28
29 #include <android-base/file.h>
30 #include <android-base/logging.h>
31 #include <android-base/parseint.h>
32 #include <android-base/properties.h>
33 #include <android-base/strings.h>
34 #include <android-base/unique_fd.h>
35 #include <cutils/sockets.h>
36 #include <ext4_utils/ext4_utils.h>
37 #include <fs_mgr.h>
38 #include <fs_mgr/file_wait.h>
39 #include <fs_mgr_dm_linear.h>
40 #include <fstab/fstab.h>
41 #include <libdm/dm.h>
42 #include <libfiemap/image_manager.h>
43 #include <liblp/liblp.h>
44
45 #include <android/snapshot/snapshot.pb.h>
46 #include <libsnapshot/snapshot_stats.h>
47 #include "device_info.h"
48 #include "partition_cow_creator.h"
49 #include "snapshot_metadata_updater.h"
50 #include "snapshot_reader.h"
51 #include "utility.h"
52
53 namespace android {
54 namespace snapshot {
55
56 using aidl::android::hardware::boot::MergeStatus;
57 using android::base::unique_fd;
58 using android::dm::DeviceMapper;
59 using android::dm::DmDeviceState;
60 using android::dm::DmTable;
61 using android::dm::DmTargetLinear;
62 using android::dm::DmTargetSnapshot;
63 using android::dm::DmTargetUser;
64 using android::dm::kSectorSize;
65 using android::dm::SnapshotStorageMode;
66 using android::fiemap::FiemapStatus;
67 using android::fiemap::IImageManager;
68 using android::fs_mgr::CreateDmTable;
69 using android::fs_mgr::CreateLogicalPartition;
70 using android::fs_mgr::CreateLogicalPartitionParams;
71 using android::fs_mgr::GetPartitionGroupName;
72 using android::fs_mgr::GetPartitionName;
73 using android::fs_mgr::LpMetadata;
74 using android::fs_mgr::MetadataBuilder;
75 using android::fs_mgr::SlotNumberForSlotSuffix;
76 using chromeos_update_engine::DeltaArchiveManifest;
77 using chromeos_update_engine::Extent;
78 using chromeos_update_engine::FileDescriptor;
79 using chromeos_update_engine::PartitionUpdate;
80 template <typename T>
81 using RepeatedPtrField = google::protobuf::RepeatedPtrField<T>;
82 using std::chrono::duration_cast;
83 using namespace std::chrono_literals;
84 using namespace std::string_literals;
85
// Fixed paths under /metadata/ota for the snapshot-boot and rollback indicator
// files. NOTE(review): neither constant is referenced in the visible part of
// this file (the code below goes through GetSnapshotBootIndicatorPath() and
// GetRollbackIndicatorPath()) -- confirm where they are still used.
static constexpr char kBootIndicatorPath[] = "/metadata/ota/snapshot-boot";
static constexpr char kRollbackIndicatorPath[] = "/metadata/ota/rollback-indicator";
// Two seconds; presumably the polling period used while waiting on the update
// state (its use is not visible here -- TODO confirm).
static constexpr auto kUpdateStateCheckInterval = 2s;

// Forward declaration: the SnapshotManager constructor installs this function
// as its merge_consistency_checker_.
MergeFailureCode CheckMergeConsistency(const std::string& name, const SnapshotStatus& status);
91
92 // Note: IImageManager is an incomplete type in the header, so the default
93 // destructor doesn't work.
~SnapshotManager()94 SnapshotManager::~SnapshotManager() {}
95
New(IDeviceInfo * info)96 std::unique_ptr<SnapshotManager> SnapshotManager::New(IDeviceInfo* info) {
97 if (!info) {
98 info = new DeviceInfo();
99 }
100
101 return std::unique_ptr<SnapshotManager>(new SnapshotManager(info));
102 }
103
NewForFirstStageMount(IDeviceInfo * info)104 std::unique_ptr<SnapshotManager> SnapshotManager::NewForFirstStageMount(IDeviceInfo* info) {
105 if (!info) {
106 DeviceInfo* impl = new DeviceInfo();
107 impl->set_first_stage_init(true);
108 info = impl;
109 }
110 auto sm = New(info);
111
112 // The first-stage version of snapuserd is explicitly started by init. Do
113 // not attempt to using it during tests (which run in normal AOSP).
114 if (!sm->device()->IsTestDevice()) {
115 sm->use_first_stage_snapuserd_ = true;
116 }
117 return sm;
118 }
119
SnapshotManager(IDeviceInfo * device)120 SnapshotManager::SnapshotManager(IDeviceInfo* device)
121 : dm_(device->GetDeviceMapper()), device_(device), metadata_dir_(device_->GetMetadataDir()) {
122 merge_consistency_checker_ = android::snapshot::CheckMergeConsistency;
123 }
124
// Builds the COW name for |snapshot_name| by appending "-cow".
static std::string GetCowName(const std::string& snapshot_name) {
    std::string cow_name(snapshot_name);
    cow_name.append("-cow");
    return cow_name;
}
128
GetSnapshotDriver(LockedFile * lock)129 SnapshotManager::SnapshotDriver SnapshotManager::GetSnapshotDriver(LockedFile* lock) {
130 if (UpdateUsesUserSnapshots(lock)) {
131 return SnapshotManager::SnapshotDriver::DM_USER;
132 } else {
133 return SnapshotManager::SnapshotDriver::DM_SNAPSHOT;
134 }
135 }
136
GetDmUserCowName(const std::string & snapshot_name,SnapshotManager::SnapshotDriver driver)137 static std::string GetDmUserCowName(const std::string& snapshot_name,
138 SnapshotManager::SnapshotDriver driver) {
139 // dm-user block device will act as a snapshot device. We identify it with
140 // the same partition name so that when partitions can be mounted off
141 // dm-user.
142
143 switch (driver) {
144 case SnapshotManager::SnapshotDriver::DM_USER: {
145 return snapshot_name;
146 }
147
148 case SnapshotManager::SnapshotDriver::DM_SNAPSHOT: {
149 return snapshot_name + "-user-cow";
150 }
151
152 default: {
153 LOG(ERROR) << "Invalid snapshot driver";
154 return "";
155 }
156 }
157 }
158
// Builds the COW image device name for |snapshot_name| by appending "-cow-img".
static std::string GetCowImageDeviceName(const std::string& snapshot_name) {
    std::string image_name(snapshot_name);
    image_name.append("-cow-img");
    return image_name;
}
162
// Builds the base device name for |partition_name| by appending "-base".
static std::string GetBaseDeviceName(const std::string& partition_name) {
    std::string base_name(partition_name);
    base_name.append("-base");
    return base_name;
}
166
// Builds the source device name for |partition_name| by appending "-src".
static std::string GetSourceDeviceName(const std::string& partition_name) {
    std::string source_name(partition_name);
    source_name.append("-src");
    return source_name;
}
170
BeginUpdate()171 bool SnapshotManager::BeginUpdate() {
172 bool needs_merge = false;
173 if (!TryCancelUpdate(&needs_merge)) {
174 return false;
175 }
176 if (needs_merge) {
177 LOG(INFO) << "Wait for merge (if any) before beginning a new update.";
178 auto state = ProcessUpdateState();
179 LOG(INFO) << "Merged with state = " << state;
180 }
181
182 auto file = LockExclusive();
183 if (!file) return false;
184
185 // Purge the ImageManager just in case there is a corrupt lp_metadata file
186 // lying around. (NB: no need to return false on an error, we can let the
187 // update try to progress.)
188 if (EnsureImageManager()) {
189 images_->RemoveAllImages();
190 }
191
192 // Clear any cached metadata (this allows re-using one manager across tests).
193 old_partition_metadata_ = nullptr;
194
195 auto state = ReadUpdateState(file.get());
196 if (state != UpdateState::None) {
197 LOG(ERROR) << "An update is already in progress, cannot begin a new update";
198 return false;
199 }
200 return WriteUpdateState(file.get(), UpdateState::Initiated);
201 }
202
CancelUpdate()203 bool SnapshotManager::CancelUpdate() {
204 bool needs_merge = false;
205 if (!TryCancelUpdate(&needs_merge)) {
206 return false;
207 }
208 if (needs_merge) {
209 LOG(ERROR) << "Cannot cancel update after it has completed or started merging";
210 }
211 return !needs_merge;
212 }
213
TryCancelUpdate(bool * needs_merge)214 bool SnapshotManager::TryCancelUpdate(bool* needs_merge) {
215 *needs_merge = false;
216
217 auto file = LockExclusive();
218 if (!file) return false;
219
220 UpdateState state = ReadUpdateState(file.get());
221 if (state == UpdateState::None) {
222 RemoveInvalidSnapshots(file.get());
223 return true;
224 }
225
226 if (state == UpdateState::Initiated) {
227 LOG(INFO) << "Update has been initiated, now canceling";
228 return RemoveAllUpdateState(file.get());
229 }
230
231 if (state == UpdateState::Unverified) {
232 // We completed an update, but it can still be canceled if we haven't booted into it.
233 auto slot = GetCurrentSlot();
234 if (slot != Slot::Target) {
235 LOG(INFO) << "Canceling previously completed updates (if any)";
236 return RemoveAllUpdateState(file.get());
237 }
238 }
239 *needs_merge = true;
240 return true;
241 }
242
ReadUpdateSourceSlotSuffix()243 std::string SnapshotManager::ReadUpdateSourceSlotSuffix() {
244 auto boot_file = GetSnapshotBootIndicatorPath();
245 std::string contents;
246 if (!android::base::ReadFileToString(boot_file, &contents)) {
247 PLOG(WARNING) << "Cannot read " << boot_file;
248 return {};
249 }
250 return contents;
251 }
252
GetCurrentSlot()253 SnapshotManager::Slot SnapshotManager::GetCurrentSlot() {
254 auto contents = ReadUpdateSourceSlotSuffix();
255 if (contents.empty()) {
256 return Slot::Unknown;
257 }
258 if (device_->GetSlotSuffix() == contents) {
259 return Slot::Source;
260 }
261 return Slot::Target;
262 }
263
GetSnapshotSlotSuffix()264 std::string SnapshotManager::GetSnapshotSlotSuffix() {
265 switch (GetCurrentSlot()) {
266 case Slot::Target:
267 return device_->GetSlotSuffix();
268 default:
269 return device_->GetOtherSlotSuffix();
270 }
271 }
272
RemoveFileIfExists(const std::string & path)273 static bool RemoveFileIfExists(const std::string& path) {
274 std::string message;
275 if (!android::base::RemoveFileIfExists(path, &message)) {
276 LOG(ERROR) << "Remove failed: " << path << ": " << message;
277 return false;
278 }
279 return true;
280 }
281
RemoveAllUpdateState(LockedFile * lock,const std::function<bool ()> & prolog)282 bool SnapshotManager::RemoveAllUpdateState(LockedFile* lock, const std::function<bool()>& prolog) {
283 if (prolog && !prolog()) {
284 LOG(WARNING) << "Can't RemoveAllUpdateState: prolog failed.";
285 return false;
286 }
287
288 LOG(INFO) << "Removing all update state.";
289
290 if (!RemoveAllSnapshots(lock)) {
291 LOG(ERROR) << "Could not remove all snapshots";
292 return false;
293 }
294
295 // It's okay if these fail:
296 // - For SnapshotBoot and Rollback, first-stage init performs a deeper check after
297 // reading the indicator file, so it's not a problem if it still exists
298 // after the update completes.
299 // - For ForwardMerge, FinishedSnapshotWrites asserts that the existence of the indicator
300 // matches the incoming update.
301 std::vector<std::string> files = {
302 GetSnapshotBootIndicatorPath(),
303 GetRollbackIndicatorPath(),
304 GetForwardMergeIndicatorPath(),
305 GetOldPartitionMetadataPath(),
306 };
307 for (const auto& file : files) {
308 RemoveFileIfExists(file);
309 }
310
311 // If this fails, we'll keep trying to remove the update state (as the
312 // device reboots or starts a new update) until it finally succeeds.
313 return WriteUpdateState(lock, UpdateState::None);
314 }
315
FinishedSnapshotWrites(bool wipe)316 bool SnapshotManager::FinishedSnapshotWrites(bool wipe) {
317 auto lock = LockExclusive();
318 if (!lock) return false;
319
320 auto update_state = ReadUpdateState(lock.get());
321 if (update_state == UpdateState::Unverified) {
322 LOG(INFO) << "FinishedSnapshotWrites already called before. Ignored.";
323 return true;
324 }
325
326 if (update_state != UpdateState::Initiated) {
327 LOG(ERROR) << "Can only transition to the Unverified state from the Initiated state.";
328 return false;
329 }
330
331 if (!EnsureNoOverflowSnapshot(lock.get())) {
332 LOG(ERROR) << "Cannot ensure there are no overflow snapshots.";
333 return false;
334 }
335
336 if (!UpdateForwardMergeIndicator(wipe)) {
337 return false;
338 }
339
340 // This file is written on boot to detect whether a rollback occurred. It
341 // MUST NOT exist before rebooting, otherwise, we're at risk of deleting
342 // snapshots too early.
343 if (!RemoveFileIfExists(GetRollbackIndicatorPath())) {
344 return false;
345 }
346
347 // This file acts as both a quick indicator for init (it can use access(2)
348 // to decide how to do first-stage mounts), and it stores the old slot, so
349 // we can tell whether or not we performed a rollback.
350 auto contents = device_->GetSlotSuffix();
351 auto boot_file = GetSnapshotBootIndicatorPath();
352 if (!WriteStringToFileAtomic(contents, boot_file)) {
353 PLOG(ERROR) << "write failed: " << boot_file;
354 return false;
355 }
356 return WriteUpdateState(lock.get(), UpdateState::Unverified);
357 }
358
CreateSnapshot(LockedFile * lock,PartitionCowCreator * cow_creator,SnapshotStatus * status)359 bool SnapshotManager::CreateSnapshot(LockedFile* lock, PartitionCowCreator* cow_creator,
360 SnapshotStatus* status) {
361 CHECK(lock);
362 CHECK(lock->lock_mode() == LOCK_EX);
363 CHECK(status);
364
365 if (status->name().empty()) {
366 LOG(ERROR) << "SnapshotStatus has no name.";
367 return false;
368 }
369 // Check these sizes. Like liblp, we guarantee the partition size is
370 // respected, which means it has to be sector-aligned. (This guarantee is
371 // useful for locating avb footers correctly). The COW file size, however,
372 // can be arbitrarily larger than specified, so we can safely round it up.
373 if (status->device_size() % kSectorSize != 0) {
374 LOG(ERROR) << "Snapshot " << status->name()
375 << " device size is not a multiple of the sector size: "
376 << status->device_size();
377 return false;
378 }
379 if (status->snapshot_size() % kSectorSize != 0) {
380 LOG(ERROR) << "Snapshot " << status->name()
381 << " snapshot size is not a multiple of the sector size: "
382 << status->snapshot_size();
383 return false;
384 }
385 if (status->cow_partition_size() % kSectorSize != 0) {
386 LOG(ERROR) << "Snapshot " << status->name()
387 << " cow partition size is not a multiple of the sector size: "
388 << status->cow_partition_size();
389 return false;
390 }
391 if (status->cow_file_size() % kSectorSize != 0) {
392 LOG(ERROR) << "Snapshot " << status->name()
393 << " cow file size is not a multiple of the sector size: "
394 << status->cow_file_size();
395 return false;
396 }
397
398 status->set_state(SnapshotState::CREATED);
399 status->set_sectors_allocated(0);
400 status->set_metadata_sectors(0);
401 status->set_using_snapuserd(cow_creator->using_snapuserd);
402 status->set_compression_algorithm(cow_creator->compression_algorithm);
403 if (cow_creator->enable_threading) {
404 status->set_enable_threading(cow_creator->enable_threading);
405 }
406 if (cow_creator->batched_writes) {
407 status->set_batched_writes(cow_creator->batched_writes);
408 }
409
410 if (!WriteSnapshotStatus(lock, *status)) {
411 PLOG(ERROR) << "Could not write snapshot status: " << status->name();
412 return false;
413 }
414 return true;
415 }
416
CreateCowImage(LockedFile * lock,const std::string & name)417 Return SnapshotManager::CreateCowImage(LockedFile* lock, const std::string& name) {
418 CHECK(lock);
419 CHECK(lock->lock_mode() == LOCK_EX);
420 if (!EnsureImageManager()) return Return::Error();
421
422 SnapshotStatus status;
423 if (!ReadSnapshotStatus(lock, name, &status)) {
424 return Return::Error();
425 }
426
427 // The COW file size should have been rounded up to the nearest sector in CreateSnapshot.
428 if (status.cow_file_size() % kSectorSize != 0) {
429 LOG(ERROR) << "Snapshot " << name << " COW file size is not a multiple of the sector size: "
430 << status.cow_file_size();
431 return Return::Error();
432 }
433
434 std::string cow_image_name = GetCowImageDeviceName(name);
435 int cow_flags = IImageManager::CREATE_IMAGE_DEFAULT;
436 return Return(images_->CreateBackingImage(cow_image_name, status.cow_file_size(), cow_flags));
437 }
438
MapDmUserCow(LockedFile * lock,const std::string & name,const std::string & cow_file,const std::string & base_device,const std::string & base_path_merge,const std::chrono::milliseconds & timeout_ms,std::string * path)439 bool SnapshotManager::MapDmUserCow(LockedFile* lock, const std::string& name,
440 const std::string& cow_file, const std::string& base_device,
441 const std::string& base_path_merge,
442 const std::chrono::milliseconds& timeout_ms, std::string* path) {
443 CHECK(lock);
444
445 if (UpdateUsesUserSnapshots(lock)) {
446 SnapshotStatus status;
447 if (!ReadSnapshotStatus(lock, name, &status)) {
448 LOG(ERROR) << "MapDmUserCow: ReadSnapshotStatus failed...";
449 return false;
450 }
451
452 if (status.state() == SnapshotState::NONE ||
453 status.state() == SnapshotState::MERGE_COMPLETED) {
454 LOG(ERROR) << "Should not create a snapshot device for " << name
455 << " after merging has completed.";
456 return false;
457 }
458
459 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock);
460 if (update_status.state() == UpdateState::MergeCompleted ||
461 update_status.state() == UpdateState::MergeNeedsReboot) {
462 LOG(ERROR) << "Should not create a snapshot device for " << name
463 << " after global merging has completed.";
464 return false;
465 }
466 }
467
468 // Use an extra decoration for first-stage init, so we can transition
469 // to a new table entry in second-stage.
470 std::string misc_name = name;
471 if (use_first_stage_snapuserd_) {
472 misc_name += "-init";
473 }
474
475 if (!EnsureSnapuserdConnected()) {
476 return false;
477 }
478
479 uint64_t base_sectors = 0;
480 if (!UpdateUsesUserSnapshots(lock)) {
481 base_sectors = snapuserd_client_->InitDmUserCow(misc_name, cow_file, base_device);
482 if (base_sectors == 0) {
483 LOG(ERROR) << "Failed to retrieve base_sectors from Snapuserd";
484 return false;
485 }
486 } else {
487 // For userspace snapshots, the size of the base device is taken as the
488 // size of the dm-user block device. Since there is no pseudo mapping
489 // created in the daemon, we no longer need to rely on the daemon for
490 // sizing the dm-user block device.
491 unique_fd fd(TEMP_FAILURE_RETRY(open(base_path_merge.c_str(), O_RDONLY | O_CLOEXEC)));
492 if (fd < 0) {
493 LOG(ERROR) << "Cannot open block device: " << base_path_merge;
494 return false;
495 }
496
497 uint64_t dev_sz = get_block_device_size(fd.get());
498 if (!dev_sz) {
499 LOG(ERROR) << "Failed to find block device size: " << base_path_merge;
500 return false;
501 }
502
503 base_sectors = dev_sz >> 9;
504 }
505
506 DmTable table;
507 table.Emplace<DmTargetUser>(0, base_sectors, misc_name);
508 if (!dm_.CreateDevice(name, table, path, timeout_ms)) {
509 LOG(ERROR) << " dm-user: CreateDevice failed... ";
510 return false;
511 }
512 if (!WaitForDevice(*path, timeout_ms)) {
513 LOG(ERROR) << " dm-user: timeout: Failed to create block device for: " << name;
514 return false;
515 }
516
517 auto control_device = "/dev/dm-user/" + misc_name;
518 if (!WaitForDevice(control_device, timeout_ms)) {
519 return false;
520 }
521
522 if (UpdateUsesUserSnapshots(lock)) {
523 // Now that the dm-user device is created, initialize the daemon and
524 // spin up the worker threads.
525 if (!snapuserd_client_->InitDmUserCow(misc_name, cow_file, base_device, base_path_merge)) {
526 LOG(ERROR) << "InitDmUserCow failed";
527 return false;
528 }
529 }
530
531 return snapuserd_client_->AttachDmUser(misc_name);
532 }
533
MapSnapshot(LockedFile * lock,const std::string & name,const std::string & base_device,const std::string & cow_device,const std::chrono::milliseconds & timeout_ms,std::string * dev_path)534 bool SnapshotManager::MapSnapshot(LockedFile* lock, const std::string& name,
535 const std::string& base_device, const std::string& cow_device,
536 const std::chrono::milliseconds& timeout_ms,
537 std::string* dev_path) {
538 CHECK(lock);
539
540 SnapshotStatus status;
541 if (!ReadSnapshotStatus(lock, name, &status)) {
542 return false;
543 }
544 if (status.state() == SnapshotState::NONE || status.state() == SnapshotState::MERGE_COMPLETED) {
545 LOG(ERROR) << "Should not create a snapshot device for " << name
546 << " after merging has completed.";
547 return false;
548 }
549
550 // Validate the block device size, as well as the requested snapshot size.
551 // Note that during first-stage init, we don't have the device paths.
552 if (android::base::StartsWith(base_device, "/")) {
553 unique_fd fd(open(base_device.c_str(), O_RDONLY | O_CLOEXEC));
554 if (fd < 0) {
555 PLOG(ERROR) << "open failed: " << base_device;
556 return false;
557 }
558 auto dev_size = get_block_device_size(fd);
559 if (!dev_size) {
560 PLOG(ERROR) << "Could not determine block device size: " << base_device;
561 return false;
562 }
563 if (status.device_size() != dev_size) {
564 LOG(ERROR) << "Block device size for " << base_device << " does not match"
565 << "(expected " << status.device_size() << ", got " << dev_size << ")";
566 return false;
567 }
568 }
569 if (status.device_size() % kSectorSize != 0) {
570 LOG(ERROR) << "invalid blockdev size for " << base_device << ": " << status.device_size();
571 return false;
572 }
573 if (status.snapshot_size() % kSectorSize != 0 ||
574 status.snapshot_size() > status.device_size()) {
575 LOG(ERROR) << "Invalid snapshot size for " << base_device << ": " << status.snapshot_size();
576 return false;
577 }
578 if (status.device_size() != status.snapshot_size()) {
579 LOG(ERROR) << "Device size and snapshot size must be the same (device size = "
580 << status.device_size() << ", snapshot size = " << status.snapshot_size();
581 return false;
582 }
583
584 uint64_t snapshot_sectors = status.snapshot_size() / kSectorSize;
585
586 // Note that merging is a global state. We do track whether individual devices
587 // have completed merging, but the start of the merge process is considered
588 // atomic.
589 SnapshotStorageMode mode;
590 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock);
591 switch (update_status.state()) {
592 case UpdateState::MergeCompleted:
593 case UpdateState::MergeNeedsReboot:
594 LOG(ERROR) << "Should not create a snapshot device for " << name
595 << " after global merging has completed.";
596 return false;
597 case UpdateState::Merging:
598 case UpdateState::MergeFailed:
599 // Note: MergeFailed indicates that a merge is in progress, but
600 // is possibly stalled. We still have to honor the merge.
601 if (DecideMergePhase(status) == update_status.merge_phase()) {
602 mode = SnapshotStorageMode::Merge;
603 } else {
604 mode = SnapshotStorageMode::Persistent;
605 }
606 break;
607 default:
608 mode = SnapshotStorageMode::Persistent;
609 break;
610 }
611
612 if (mode == SnapshotStorageMode::Persistent && status.state() == SnapshotState::MERGING) {
613 LOG(ERROR) << "Snapshot: " << name
614 << " has snapshot status Merging but mode set to Persistent."
615 << " Changing mode to Snapshot-Merge.";
616 mode = SnapshotStorageMode::Merge;
617 }
618
619 DmTable table;
620 table.Emplace<DmTargetSnapshot>(0, snapshot_sectors, base_device, cow_device, mode,
621 kSnapshotChunkSize);
622 if (!dm_.CreateDevice(name, table, dev_path, timeout_ms)) {
623 LOG(ERROR) << "Could not create snapshot device: " << name;
624 return false;
625 }
626 return true;
627 }
628
MapCowImage(const std::string & name,const std::chrono::milliseconds & timeout_ms)629 std::optional<std::string> SnapshotManager::MapCowImage(
630 const std::string& name, const std::chrono::milliseconds& timeout_ms) {
631 if (!EnsureImageManager()) return std::nullopt;
632 auto cow_image_name = GetCowImageDeviceName(name);
633
634 bool ok;
635 std::string cow_dev;
636 if (device_->IsRecovery() || device_->IsFirstStageInit()) {
637 const auto& opener = device_->GetPartitionOpener();
638 ok = images_->MapImageWithDeviceMapper(opener, cow_image_name, &cow_dev);
639 } else {
640 ok = images_->MapImageDevice(cow_image_name, timeout_ms, &cow_dev);
641 }
642
643 if (ok) {
644 LOG(INFO) << "Mapped " << cow_image_name << " to " << cow_dev;
645 return cow_dev;
646 }
647 LOG(ERROR) << "Could not map image device: " << cow_image_name;
648 return std::nullopt;
649 }
650
MapSourceDevice(LockedFile * lock,const std::string & name,const std::chrono::milliseconds & timeout_ms,std::string * path)651 bool SnapshotManager::MapSourceDevice(LockedFile* lock, const std::string& name,
652 const std::chrono::milliseconds& timeout_ms,
653 std::string* path) {
654 CHECK(lock);
655
656 auto metadata = ReadOldPartitionMetadata(lock);
657 if (!metadata) {
658 LOG(ERROR) << "Could not map source device due to missing or corrupt metadata";
659 return false;
660 }
661
662 auto old_name = GetOtherPartitionName(name);
663 auto slot_suffix = device_->GetSlotSuffix();
664 auto slot = SlotNumberForSlotSuffix(slot_suffix);
665
666 CreateLogicalPartitionParams params = {
667 .block_device = device_->GetSuperDevice(slot),
668 .metadata = metadata,
669 .partition_name = old_name,
670 .timeout_ms = timeout_ms,
671 .device_name = GetSourceDeviceName(name),
672 .partition_opener = &device_->GetPartitionOpener(),
673 };
674 if (!CreateLogicalPartition(std::move(params), path)) {
675 LOG(ERROR) << "Could not create source device for snapshot " << name;
676 return false;
677 }
678 return true;
679 }
680
UnmapSnapshot(LockedFile * lock,const std::string & name)681 bool SnapshotManager::UnmapSnapshot(LockedFile* lock, const std::string& name) {
682 CHECK(lock);
683
684 if (UpdateUsesUserSnapshots(lock)) {
685 if (!UnmapUserspaceSnapshotDevice(lock, name)) {
686 return false;
687 }
688 } else {
689 if (!DeleteDeviceIfExists(name)) {
690 LOG(ERROR) << "Could not delete snapshot device: " << name;
691 return false;
692 }
693 }
694 return true;
695 }
696
UnmapCowImage(const std::string & name)697 bool SnapshotManager::UnmapCowImage(const std::string& name) {
698 if (!EnsureImageManager()) return false;
699 return images_->UnmapImageIfExists(GetCowImageDeviceName(name));
700 }
701
DeleteSnapshot(LockedFile * lock,const std::string & name)702 bool SnapshotManager::DeleteSnapshot(LockedFile* lock, const std::string& name) {
703 CHECK(lock);
704 CHECK(lock->lock_mode() == LOCK_EX);
705 if (!EnsureImageManager()) return false;
706
707 if (!UnmapCowDevices(lock, name)) {
708 return false;
709 }
710
711 // We can't delete snapshots in recovery. The only way we'd try is it we're
712 // completing or canceling a merge in preparation for a data wipe, in which
713 // case, we don't care if the file sticks around.
714 if (device_->IsRecovery()) {
715 LOG(INFO) << "Skipping delete of snapshot " << name << " in recovery.";
716 return true;
717 }
718
719 auto cow_image_name = GetCowImageDeviceName(name);
720 if (images_->BackingImageExists(cow_image_name)) {
721 if (!images_->DeleteBackingImage(cow_image_name)) {
722 return false;
723 }
724 }
725
726 std::string error;
727 auto file_path = GetSnapshotStatusFilePath(name);
728 if (!android::base::RemoveFileIfExists(file_path, &error)) {
729 LOG(ERROR) << "Failed to remove status file " << file_path << ": " << error;
730 return false;
731 }
732 return true;
733 }
734
InitiateMerge()735 bool SnapshotManager::InitiateMerge() {
736 auto lock = LockExclusive();
737 if (!lock) return false;
738
739 UpdateState state = ReadUpdateState(lock.get());
740 if (state != UpdateState::Unverified) {
741 LOG(ERROR) << "Cannot begin a merge if an update has not been verified";
742 return false;
743 }
744
745 auto slot = GetCurrentSlot();
746 if (slot != Slot::Target) {
747 LOG(ERROR) << "Device cannot merge while not booting from new slot";
748 return false;
749 }
750
751 std::vector<std::string> snapshots;
752 if (!ListSnapshots(lock.get(), &snapshots)) {
753 LOG(ERROR) << "Could not list snapshots";
754 return false;
755 }
756
757 auto other_suffix = device_->GetOtherSlotSuffix();
758
759 for (const auto& snapshot : snapshots) {
760 if (android::base::EndsWith(snapshot, other_suffix)) {
761 // Allow the merge to continue, but log this unexpected case.
762 LOG(ERROR) << "Unexpected snapshot found during merge: " << snapshot;
763 continue;
764 }
765
766 // The device has to be mapped, since everything should be merged at
767 // the same time. This is a fairly serious error. We could forcefully
768 // map everything here, but it should have been mapped during first-
769 // stage init.
770 if (dm_.GetState(snapshot) == DmDeviceState::INVALID) {
771 LOG(ERROR) << "Cannot begin merge; device " << snapshot << " is not mapped.";
772 return false;
773 }
774 }
775
776 auto metadata = ReadCurrentMetadata();
777 for (auto it = snapshots.begin(); it != snapshots.end();) {
778 switch (GetMetadataPartitionState(*metadata, *it)) {
779 case MetadataPartitionState::Flashed:
780 LOG(WARNING) << "Detected re-flashing for partition " << *it
781 << ". Skip merging it.";
782 [[fallthrough]];
783 case MetadataPartitionState::None: {
784 LOG(WARNING) << "Deleting snapshot for partition " << *it;
785 if (!DeleteSnapshot(lock.get(), *it)) {
786 LOG(WARNING) << "Cannot delete snapshot for partition " << *it
787 << ". Skip merging it anyways.";
788 }
789 it = snapshots.erase(it);
790 } break;
791 case MetadataPartitionState::Updated: {
792 ++it;
793 } break;
794 }
795 }
796
797 bool using_snapuserd = false;
798
799 std::vector<std::string> first_merge_group;
800
801 DmTargetSnapshot::Status initial_target_values = {};
802 for (const auto& snapshot : snapshots) {
803 if (!UpdateUsesUserSnapshots(lock.get())) {
804 DmTargetSnapshot::Status current_status;
805 if (!QuerySnapshotStatus(snapshot, nullptr, ¤t_status)) {
806 return false;
807 }
808 initial_target_values.sectors_allocated += current_status.sectors_allocated;
809 initial_target_values.total_sectors += current_status.total_sectors;
810 initial_target_values.metadata_sectors += current_status.metadata_sectors;
811 }
812
813 SnapshotStatus snapshot_status;
814 if (!ReadSnapshotStatus(lock.get(), snapshot, &snapshot_status)) {
815 return false;
816 }
817
818 using_snapuserd |= snapshot_status.using_snapuserd();
819 if (DecideMergePhase(snapshot_status) == MergePhase::FIRST_PHASE) {
820 first_merge_group.emplace_back(snapshot);
821 }
822 }
823
824 SnapshotUpdateStatus initial_status = ReadSnapshotUpdateStatus(lock.get());
825 initial_status.set_state(UpdateState::Merging);
826 initial_status.set_using_snapuserd(using_snapuserd);
827
828 if (!UpdateUsesUserSnapshots(lock.get())) {
829 initial_status.set_sectors_allocated(initial_target_values.sectors_allocated);
830 initial_status.set_total_sectors(initial_target_values.total_sectors);
831 initial_status.set_metadata_sectors(initial_target_values.metadata_sectors);
832 }
833
834 // If any partitions shrunk, we need to merge them before we merge any other
835 // partitions (see b/177935716). Otherwise, a merge from another partition
836 // may overwrite the source block of a copy operation.
837 const std::vector<std::string>* merge_group;
838 if (first_merge_group.empty()) {
839 merge_group = &snapshots;
840 initial_status.set_merge_phase(MergePhase::SECOND_PHASE);
841 } else {
842 merge_group = &first_merge_group;
843 initial_status.set_merge_phase(MergePhase::FIRST_PHASE);
844 }
845
846 // Point of no return - mark that we're starting a merge. From now on every
847 // eligible snapshot must be a merge target.
848 if (!WriteSnapshotUpdateStatus(lock.get(), initial_status)) {
849 return false;
850 }
851
852 auto reported_code = MergeFailureCode::Ok;
853 for (const auto& snapshot : *merge_group) {
854 // If this fails, we have no choice but to continue. Everything must
855 // be merged. This is not an ideal state to be in, but it is safe,
856 // because we the next boot will try again.
857 auto code = SwitchSnapshotToMerge(lock.get(), snapshot);
858 if (code != MergeFailureCode::Ok) {
859 LOG(ERROR) << "Failed to switch snapshot to a merge target: " << snapshot;
860 if (reported_code == MergeFailureCode::Ok) {
861 reported_code = code;
862 }
863 }
864 }
865
866 // If we couldn't switch everything to a merge target, pre-emptively mark
867 // this merge as failed. It will get acknowledged when WaitForMerge() is
868 // called.
869 if (reported_code != MergeFailureCode::Ok) {
870 WriteUpdateState(lock.get(), UpdateState::MergeFailed, reported_code);
871 }
872
873 // Return true no matter what, because a merge was initiated.
874 return true;
875 }
876
SwitchSnapshotToMerge(LockedFile * lock,const std::string & name)877 MergeFailureCode SnapshotManager::SwitchSnapshotToMerge(LockedFile* lock, const std::string& name) {
878 SnapshotStatus status;
879 if (!ReadSnapshotStatus(lock, name, &status)) {
880 return MergeFailureCode::ReadStatus;
881 }
882 if (status.state() != SnapshotState::CREATED) {
883 LOG(WARNING) << "Snapshot " << name
884 << " has unexpected state: " << SnapshotState_Name(status.state());
885 }
886
887 if (UpdateUsesUserSnapshots(lock)) {
888 if (EnsureSnapuserdConnected()) {
889 // This is the point where we inform the daemon to initiate/resume
890 // the merge
891 if (!snapuserd_client_->InitiateMerge(name)) {
892 return MergeFailureCode::UnknownTable;
893 }
894 } else {
895 LOG(ERROR) << "Failed to connect to snapuserd daemon to initiate merge";
896 return MergeFailureCode::UnknownTable;
897 }
898 } else {
899 // After this, we return true because we technically did switch to a merge
900 // target. Everything else we do here is just informational.
901 if (auto code = RewriteSnapshotDeviceTable(name); code != MergeFailureCode::Ok) {
902 return code;
903 }
904 }
905
906 status.set_state(SnapshotState::MERGING);
907
908 if (!UpdateUsesUserSnapshots(lock)) {
909 DmTargetSnapshot::Status dm_status;
910 if (!QuerySnapshotStatus(name, nullptr, &dm_status)) {
911 LOG(ERROR) << "Could not query merge status for snapshot: " << name;
912 }
913 status.set_sectors_allocated(dm_status.sectors_allocated);
914 status.set_metadata_sectors(dm_status.metadata_sectors);
915 }
916
917 if (!WriteSnapshotStatus(lock, status)) {
918 LOG(ERROR) << "Could not update status file for snapshot: " << name;
919 }
920 return MergeFailureCode::Ok;
921 }
922
RewriteSnapshotDeviceTable(const std::string & name)923 MergeFailureCode SnapshotManager::RewriteSnapshotDeviceTable(const std::string& name) {
924 std::vector<DeviceMapper::TargetInfo> old_targets;
925 if (!dm_.GetTableInfo(name, &old_targets)) {
926 LOG(ERROR) << "Could not read snapshot device table: " << name;
927 return MergeFailureCode::GetTableInfo;
928 }
929 if (old_targets.size() != 1 || DeviceMapper::GetTargetType(old_targets[0].spec) != "snapshot") {
930 LOG(ERROR) << "Unexpected device-mapper table for snapshot: " << name;
931 return MergeFailureCode::UnknownTable;
932 }
933
934 std::string base_device, cow_device;
935 if (!DmTargetSnapshot::GetDevicesFromParams(old_targets[0].data, &base_device, &cow_device)) {
936 LOG(ERROR) << "Could not derive underlying devices for snapshot: " << name;
937 return MergeFailureCode::GetTableParams;
938 }
939
940 DmTable table;
941 table.Emplace<DmTargetSnapshot>(0, old_targets[0].spec.length, base_device, cow_device,
942 SnapshotStorageMode::Merge, kSnapshotChunkSize);
943 if (!dm_.LoadTableAndActivate(name, table)) {
944 LOG(ERROR) << "Could not swap device-mapper tables on snapshot device " << name;
945 return MergeFailureCode::ActivateNewTable;
946 }
947 LOG(INFO) << "Successfully switched snapshot device to a merge target: " << name;
948 return MergeFailureCode::Ok;
949 }
950
GetSingleTarget(const std::string & dm_name,TableQuery query,DeviceMapper::TargetInfo * target)951 bool SnapshotManager::GetSingleTarget(const std::string& dm_name, TableQuery query,
952 DeviceMapper::TargetInfo* target) {
953 if (dm_.GetState(dm_name) == DmDeviceState::INVALID) {
954 return false;
955 }
956
957 std::vector<DeviceMapper::TargetInfo> targets;
958 bool result;
959 if (query == TableQuery::Status) {
960 result = dm_.GetTableStatus(dm_name, &targets);
961 } else {
962 result = dm_.GetTableInfo(dm_name, &targets);
963 }
964 if (!result) {
965 LOG(ERROR) << "Could not query device: " << dm_name;
966 return false;
967 }
968 if (targets.size() != 1) {
969 return false;
970 }
971
972 *target = std::move(targets[0]);
973 return true;
974 }
975
IsSnapshotDevice(const std::string & dm_name,TargetInfo * target)976 bool SnapshotManager::IsSnapshotDevice(const std::string& dm_name, TargetInfo* target) {
977 DeviceMapper::TargetInfo snap_target;
978 if (!GetSingleTarget(dm_name, TableQuery::Status, &snap_target)) {
979 return false;
980 }
981 auto type = DeviceMapper::GetTargetType(snap_target.spec);
982
983 // If this is not a user-snapshot device then it should either
984 // be a dm-snapshot or dm-snapshot-merge target
985 if (type != "user") {
986 if (type != "snapshot" && type != "snapshot-merge") {
987 return false;
988 }
989 }
990
991 if (target) {
992 *target = std::move(snap_target);
993 }
994 return true;
995 }
996
UpdateStateToStr(const enum UpdateState state)997 auto SnapshotManager::UpdateStateToStr(const enum UpdateState state) {
998 switch (state) {
999 case None:
1000 return "None";
1001 case Initiated:
1002 return "Initiated";
1003 case Unverified:
1004 return "Unverified";
1005 case Merging:
1006 return "Merging";
1007 case MergeNeedsReboot:
1008 return "MergeNeedsReboot";
1009 case MergeCompleted:
1010 return "MergeCompleted";
1011 case MergeFailed:
1012 return "MergeFailed";
1013 case Cancelled:
1014 return "Cancelled";
1015 default:
1016 return "Unknown";
1017 }
1018 }
1019
QuerySnapshotStatus(const std::string & dm_name,std::string * target_type,DmTargetSnapshot::Status * status)1020 bool SnapshotManager::QuerySnapshotStatus(const std::string& dm_name, std::string* target_type,
1021 DmTargetSnapshot::Status* status) {
1022 DeviceMapper::TargetInfo target;
1023 if (!IsSnapshotDevice(dm_name, &target)) {
1024 LOG(ERROR) << "Device " << dm_name << " is not a snapshot or snapshot-merge device";
1025 return false;
1026 }
1027 if (!DmTargetSnapshot::ParseStatusText(target.data, status)) {
1028 LOG(ERROR) << "Could not parse snapshot status text: " << dm_name;
1029 return false;
1030 }
1031 if (target_type) {
1032 *target_type = DeviceMapper::GetTargetType(target.spec);
1033 }
1034 if (!status->error.empty()) {
1035 LOG(ERROR) << "Snapshot: " << dm_name << " returned error code: " << status->error;
1036 return false;
1037 }
1038 return true;
1039 }
1040
1041 // Note that when a merge fails, we will *always* try again to complete the
1042 // merge each time the device boots. There is no harm in doing so, and if
1043 // the problem was transient, we might manage to get a new outcome.
ProcessUpdateState(const std::function<bool ()> & callback,const std::function<bool ()> & before_cancel)1044 UpdateState SnapshotManager::ProcessUpdateState(const std::function<bool()>& callback,
1045 const std::function<bool()>& before_cancel) {
1046 while (true) {
1047 auto result = CheckMergeState(before_cancel);
1048 LOG(INFO) << "ProcessUpdateState handling state: " << UpdateStateToStr(result.state);
1049
1050 if (result.state == UpdateState::MergeFailed) {
1051 AcknowledgeMergeFailure(result.failure_code);
1052 }
1053 if (result.state != UpdateState::Merging) {
1054 // Either there is no merge, or the merge was finished, so no need
1055 // to keep waiting.
1056 return result.state;
1057 }
1058
1059 if (callback && !callback()) {
1060 return result.state;
1061 }
1062
1063 // This wait is not super time sensitive, so we have a relatively
1064 // low polling frequency.
1065 std::this_thread::sleep_for(kUpdateStateCheckInterval);
1066 }
1067 }
1068
CheckMergeState(const std::function<bool ()> & before_cancel)1069 auto SnapshotManager::CheckMergeState(const std::function<bool()>& before_cancel) -> MergeResult {
1070 auto lock = LockExclusive();
1071 if (!lock) {
1072 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::AcquireLock);
1073 }
1074
1075 auto result = CheckMergeState(lock.get(), before_cancel);
1076 LOG(INFO) << "CheckMergeState for snapshots returned: " << UpdateStateToStr(result.state);
1077
1078 if (result.state == UpdateState::MergeCompleted) {
1079 // Do this inside the same lock. Failures get acknowledged without the
1080 // lock, because flock() might have failed.
1081 AcknowledgeMergeSuccess(lock.get());
1082 } else if (result.state == UpdateState::Cancelled) {
1083 if (!device_->IsRecovery() && !RemoveAllUpdateState(lock.get(), before_cancel)) {
1084 LOG(ERROR) << "Failed to remove all update state after acknowleding cancelled update.";
1085 }
1086 }
1087 return result;
1088 }
1089
CheckMergeState(LockedFile * lock,const std::function<bool ()> & before_cancel)1090 auto SnapshotManager::CheckMergeState(LockedFile* lock, const std::function<bool()>& before_cancel)
1091 -> MergeResult {
1092 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock);
1093 switch (update_status.state()) {
1094 case UpdateState::None:
1095 case UpdateState::MergeCompleted:
1096 // Harmless races are allowed between two callers of WaitForMerge,
1097 // so in both of these cases we just propagate the state.
1098 return MergeResult(update_status.state());
1099
1100 case UpdateState::Merging:
1101 case UpdateState::MergeNeedsReboot:
1102 case UpdateState::MergeFailed:
1103 // We'll poll each snapshot below. Note that for the NeedsReboot
1104 // case, we always poll once to give cleanup another opportunity to
1105 // run.
1106 break;
1107
1108 case UpdateState::Unverified:
1109 // This is an edge case. Normally cancelled updates are detected
1110 // via the merge poll below, but if we never started a merge, we
1111 // need to also check here.
1112 if (HandleCancelledUpdate(lock, before_cancel)) {
1113 return MergeResult(UpdateState::Cancelled);
1114 }
1115 return MergeResult(update_status.state());
1116
1117 default:
1118 return MergeResult(update_status.state());
1119 }
1120
1121 std::vector<std::string> snapshots;
1122 if (!ListSnapshots(lock, &snapshots)) {
1123 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ListSnapshots);
1124 }
1125
1126 auto other_suffix = device_->GetOtherSlotSuffix();
1127
1128 bool cancelled = false;
1129 bool merging = false;
1130 bool needs_reboot = false;
1131 bool wrong_phase = false;
1132 MergeFailureCode failure_code = MergeFailureCode::Ok;
1133 for (const auto& snapshot : snapshots) {
1134 if (android::base::EndsWith(snapshot, other_suffix)) {
1135 // This will have triggered an error message in InitiateMerge already.
1136 LOG(INFO) << "Skipping merge validation of unexpected snapshot: " << snapshot;
1137 continue;
1138 }
1139
1140 auto result = CheckTargetMergeState(lock, snapshot, update_status);
1141 LOG(INFO) << "CheckTargetMergeState for " << snapshot
1142 << " returned: " << UpdateStateToStr(result.state);
1143
1144 switch (result.state) {
1145 case UpdateState::MergeFailed:
1146 // Take the first failure code in case other failures compound.
1147 if (failure_code == MergeFailureCode::Ok) {
1148 failure_code = result.failure_code;
1149 }
1150 break;
1151 case UpdateState::Merging:
1152 merging = true;
1153 break;
1154 case UpdateState::MergeNeedsReboot:
1155 needs_reboot = true;
1156 break;
1157 case UpdateState::MergeCompleted:
1158 break;
1159 case UpdateState::Cancelled:
1160 cancelled = true;
1161 break;
1162 case UpdateState::None:
1163 wrong_phase = true;
1164 break;
1165 default:
1166 LOG(ERROR) << "Unknown merge status for \"" << snapshot << "\": "
1167 << "\"" << result.state << "\"";
1168 if (failure_code == MergeFailureCode::Ok) {
1169 failure_code = MergeFailureCode::UnexpectedMergeState;
1170 }
1171 break;
1172 }
1173 }
1174
1175 if (merging) {
1176 // Note that we handle "Merging" before we handle anything else. We
1177 // want to poll until *nothing* is merging if we can, so everything has
1178 // a chance to get marked as completed or failed.
1179 return MergeResult(UpdateState::Merging);
1180 }
1181 if (failure_code != MergeFailureCode::Ok) {
1182 // Note: since there are many drop-out cases for failure, we acknowledge
1183 // it in WaitForMerge rather than here and elsewhere.
1184 return MergeResult(UpdateState::MergeFailed, failure_code);
1185 }
1186 if (wrong_phase) {
1187 // If we got here, no other partitions are being merged, and nothing
1188 // failed to merge. It's safe to move to the next merge phase.
1189 auto code = MergeSecondPhaseSnapshots(lock);
1190 if (code != MergeFailureCode::Ok) {
1191 return MergeResult(UpdateState::MergeFailed, code);
1192 }
1193 return MergeResult(UpdateState::Merging);
1194 }
1195 if (needs_reboot) {
1196 WriteUpdateState(lock, UpdateState::MergeNeedsReboot);
1197 return MergeResult(UpdateState::MergeNeedsReboot);
1198 }
1199 if (cancelled) {
1200 // This is an edge case, that we handle as correctly as we sensibly can.
1201 // The underlying partition has changed behind update_engine, and we've
1202 // removed the snapshot as a result. The exact state of the update is
1203 // undefined now, but this can only happen on an unlocked device where
1204 // partitions can be flashed without wiping userdata.
1205 return MergeResult(UpdateState::Cancelled);
1206 }
1207 return MergeResult(UpdateState::MergeCompleted);
1208 }
1209
CheckTargetMergeState(LockedFile * lock,const std::string & name,const SnapshotUpdateStatus & update_status)1210 auto SnapshotManager::CheckTargetMergeState(LockedFile* lock, const std::string& name,
1211 const SnapshotUpdateStatus& update_status)
1212 -> MergeResult {
1213 SnapshotStatus snapshot_status;
1214 if (!ReadSnapshotStatus(lock, name, &snapshot_status)) {
1215 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ReadStatus);
1216 }
1217
1218 std::unique_ptr<LpMetadata> current_metadata;
1219
1220 if (!IsSnapshotDevice(name)) {
1221 if (!current_metadata) {
1222 current_metadata = ReadCurrentMetadata();
1223 }
1224
1225 if (!current_metadata ||
1226 GetMetadataPartitionState(*current_metadata, name) != MetadataPartitionState::Updated) {
1227 DeleteSnapshot(lock, name);
1228 return MergeResult(UpdateState::Cancelled);
1229 }
1230
1231 // During a check, we decided the merge was complete, but we were unable to
1232 // collapse the device-mapper stack and perform COW cleanup. If we haven't
1233 // rebooted after this check, the device will still be a snapshot-merge
1234 // target. If we have rebooted, the device will now be a linear target,
1235 // and we can try cleanup again.
1236 if (snapshot_status.state() == SnapshotState::MERGE_COMPLETED) {
1237 // NB: It's okay if this fails now, we gave cleanup our best effort.
1238 OnSnapshotMergeComplete(lock, name, snapshot_status);
1239 return MergeResult(UpdateState::MergeCompleted);
1240 }
1241
1242 LOG(ERROR) << "Expected snapshot or snapshot-merge for device: " << name;
1243 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::UnknownTargetType);
1244 }
1245
1246 // This check is expensive so it is only enabled for debugging.
1247 DCHECK((current_metadata = ReadCurrentMetadata()) &&
1248 GetMetadataPartitionState(*current_metadata, name) == MetadataPartitionState::Updated);
1249
1250 if (UpdateUsesUserSnapshots(lock)) {
1251 std::string merge_status;
1252 if (EnsureSnapuserdConnected()) {
1253 // Query the snapshot status from the daemon
1254 merge_status = snapuserd_client_->QuerySnapshotStatus(name);
1255 } else {
1256 MergeResult(UpdateState::MergeFailed, MergeFailureCode::QuerySnapshotStatus);
1257 }
1258
1259 if (merge_status == "snapshot-merge-failed") {
1260 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::UnknownTargetType);
1261 }
1262
1263 // This is the case when device reboots during merge. Once the device boots,
1264 // snapuserd daemon will not resume merge immediately in first stage init.
1265 // This is slightly different as compared to dm-snapshot-merge; In this
1266 // case, metadata file will have "MERGING" state whereas the daemon will be
1267 // waiting to resume the merge. Thus, we resume the merge at this point.
1268 if (merge_status == "snapshot" && snapshot_status.state() == SnapshotState::MERGING) {
1269 if (!snapuserd_client_->InitiateMerge(name)) {
1270 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::UnknownTargetType);
1271 }
1272 return MergeResult(UpdateState::Merging);
1273 }
1274
1275 if (merge_status == "snapshot" &&
1276 DecideMergePhase(snapshot_status) == MergePhase::SECOND_PHASE &&
1277 update_status.merge_phase() == MergePhase::FIRST_PHASE) {
1278 // The snapshot is not being merged because it's in the wrong phase.
1279 return MergeResult(UpdateState::None);
1280 }
1281
1282 if (merge_status == "snapshot-merge") {
1283 if (snapshot_status.state() == SnapshotState::MERGE_COMPLETED) {
1284 LOG(ERROR) << "Snapshot " << name
1285 << " is merging after being marked merge-complete.";
1286 return MergeResult(UpdateState::MergeFailed,
1287 MergeFailureCode::UnmergedSectorsAfterCompletion);
1288 }
1289 return MergeResult(UpdateState::Merging);
1290 }
1291
1292 if (merge_status != "snapshot-merge-complete") {
1293 LOG(ERROR) << "Snapshot " << name << " has incorrect status: " << merge_status;
1294 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ExpectedMergeTarget);
1295 }
1296 } else {
1297 // dm-snapshot in the kernel
1298 std::string target_type;
1299 DmTargetSnapshot::Status status;
1300 if (!QuerySnapshotStatus(name, &target_type, &status)) {
1301 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::QuerySnapshotStatus);
1302 }
1303 if (target_type == "snapshot" &&
1304 DecideMergePhase(snapshot_status) == MergePhase::SECOND_PHASE &&
1305 update_status.merge_phase() == MergePhase::FIRST_PHASE) {
1306 // The snapshot is not being merged because it's in the wrong phase.
1307 return MergeResult(UpdateState::None);
1308 }
1309 if (target_type != "snapshot-merge") {
1310 // We can get here if we failed to rewrite the target type in
1311 // InitiateMerge(). If we failed to create the target in first-stage
1312 // init, boot would not succeed.
1313 LOG(ERROR) << "Snapshot " << name << " has incorrect target type: " << target_type;
1314 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::ExpectedMergeTarget);
1315 }
1316
1317 // These two values are equal when merging is complete.
1318 if (status.sectors_allocated != status.metadata_sectors) {
1319 if (snapshot_status.state() == SnapshotState::MERGE_COMPLETED) {
1320 LOG(ERROR) << "Snapshot " << name
1321 << " is merging after being marked merge-complete.";
1322 return MergeResult(UpdateState::MergeFailed,
1323 MergeFailureCode::UnmergedSectorsAfterCompletion);
1324 }
1325 return MergeResult(UpdateState::Merging);
1326 }
1327 }
1328
1329 // Merge is complete at this point
1330
1331 auto code = CheckMergeConsistency(lock, name, snapshot_status);
1332 if (code != MergeFailureCode::Ok) {
1333 return MergeResult(UpdateState::MergeFailed, code);
1334 }
1335
1336 // Merging is done. First, update the status file to indicate the merge
1337 // is complete. We do this before calling OnSnapshotMergeComplete, even
1338 // though this means the write is potentially wasted work (since in the
1339 // ideal case we'll immediately delete the file).
1340 //
1341 // This makes it simpler to reason about the next reboot: no matter what
1342 // part of cleanup failed, first-stage init won't try to create another
1343 // snapshot device for this partition.
1344 snapshot_status.set_state(SnapshotState::MERGE_COMPLETED);
1345 if (!WriteSnapshotStatus(lock, snapshot_status)) {
1346 return MergeResult(UpdateState::MergeFailed, MergeFailureCode::WriteStatus);
1347 }
1348 if (!OnSnapshotMergeComplete(lock, name, snapshot_status)) {
1349 return MergeResult(UpdateState::MergeNeedsReboot);
1350 }
1351 return MergeResult(UpdateState::MergeCompleted, MergeFailureCode::Ok);
1352 }
1353
1354 // This returns the backing device, not the dm-user layer.
GetMappedCowDeviceName(const std::string & snapshot,const SnapshotStatus & status)1355 static std::string GetMappedCowDeviceName(const std::string& snapshot,
1356 const SnapshotStatus& status) {
1357 // If no partition was created (the COW exists entirely on /data), the
1358 // device-mapper layering is different than if we had a partition.
1359 if (status.cow_partition_size() == 0) {
1360 return GetCowImageDeviceName(snapshot);
1361 }
1362 return GetCowName(snapshot);
1363 }
1364
CheckMergeConsistency(LockedFile * lock,const std::string & name,const SnapshotStatus & status)1365 MergeFailureCode SnapshotManager::CheckMergeConsistency(LockedFile* lock, const std::string& name,
1366 const SnapshotStatus& status) {
1367 CHECK(lock);
1368
1369 return merge_consistency_checker_(name, status);
1370 }
1371
CheckMergeConsistency(const std::string & name,const SnapshotStatus & status)1372 MergeFailureCode CheckMergeConsistency(const std::string& name, const SnapshotStatus& status) {
1373 if (!status.using_snapuserd()) {
1374 // Do not try to verify old-style COWs yet.
1375 return MergeFailureCode::Ok;
1376 }
1377
1378 auto& dm = DeviceMapper::Instance();
1379
1380 std::string cow_image_name = GetMappedCowDeviceName(name, status);
1381 std::string cow_image_path;
1382 if (!dm.GetDmDevicePathByName(cow_image_name, &cow_image_path)) {
1383 LOG(ERROR) << "Failed to get path for cow device: " << cow_image_name;
1384 return MergeFailureCode::GetCowPathConsistencyCheck;
1385 }
1386
1387 // First pass, count # of ops.
1388 size_t num_ops = 0;
1389 {
1390 unique_fd fd(open(cow_image_path.c_str(), O_RDONLY | O_CLOEXEC));
1391 if (fd < 0) {
1392 PLOG(ERROR) << "Failed to open " << cow_image_name;
1393 return MergeFailureCode::OpenCowConsistencyCheck;
1394 }
1395
1396 CowReader reader;
1397 if (!reader.Parse(std::move(fd))) {
1398 LOG(ERROR) << "Failed to parse cow " << cow_image_path;
1399 return MergeFailureCode::ParseCowConsistencyCheck;
1400 }
1401
1402 num_ops = reader.get_num_total_data_ops();
1403 }
1404
1405 // Second pass, try as hard as we can to get the actual number of blocks
1406 // the system thinks is merged.
1407 unique_fd fd(open(cow_image_path.c_str(), O_RDONLY | O_DIRECT | O_SYNC | O_CLOEXEC));
1408 if (fd < 0) {
1409 PLOG(ERROR) << "Failed to open direct " << cow_image_name;
1410 return MergeFailureCode::OpenCowDirectConsistencyCheck;
1411 }
1412
1413 void* addr;
1414 size_t page_size = getpagesize();
1415 if (posix_memalign(&addr, page_size, page_size) < 0) {
1416 PLOG(ERROR) << "posix_memalign with page size " << page_size;
1417 return MergeFailureCode::MemAlignConsistencyCheck;
1418 }
1419
1420 // COWs are always at least 2MB, this is guaranteed in snapshot creation.
1421 std::unique_ptr<void, decltype(&::free)> buffer(addr, ::free);
1422 if (!android::base::ReadFully(fd, buffer.get(), page_size)) {
1423 PLOG(ERROR) << "Direct read failed " << cow_image_name;
1424 return MergeFailureCode::DirectReadConsistencyCheck;
1425 }
1426
1427 auto header = reinterpret_cast<CowHeader*>(buffer.get());
1428 if (header->num_merge_ops != num_ops) {
1429 LOG(ERROR) << "COW consistency check failed, expected " << num_ops << " to be merged, "
1430 << "but " << header->num_merge_ops << " were actually recorded.";
1431 LOG(ERROR) << "Aborting merge progress for snapshot " << name
1432 << ", will try again next boot";
1433 return MergeFailureCode::WrongMergeCountConsistencyCheck;
1434 }
1435
1436 return MergeFailureCode::Ok;
1437 }
1438
MergeSecondPhaseSnapshots(LockedFile * lock)1439 MergeFailureCode SnapshotManager::MergeSecondPhaseSnapshots(LockedFile* lock) {
1440 std::vector<std::string> snapshots;
1441 if (!ListSnapshots(lock, &snapshots)) {
1442 return MergeFailureCode::ListSnapshots;
1443 }
1444
1445 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock);
1446 CHECK(update_status.state() == UpdateState::Merging ||
1447 update_status.state() == UpdateState::MergeFailed);
1448 CHECK(update_status.merge_phase() == MergePhase::FIRST_PHASE);
1449
1450 update_status.set_state(UpdateState::Merging);
1451 update_status.set_merge_phase(MergePhase::SECOND_PHASE);
1452 if (!WriteSnapshotUpdateStatus(lock, update_status)) {
1453 return MergeFailureCode::WriteStatus;
1454 }
1455
1456 MergeFailureCode result = MergeFailureCode::Ok;
1457 for (const auto& snapshot : snapshots) {
1458 SnapshotStatus snapshot_status;
1459 if (!ReadSnapshotStatus(lock, snapshot, &snapshot_status)) {
1460 return MergeFailureCode::ReadStatus;
1461 }
1462 if (DecideMergePhase(snapshot_status) != MergePhase::SECOND_PHASE) {
1463 continue;
1464 }
1465 auto code = SwitchSnapshotToMerge(lock, snapshot);
1466 if (code != MergeFailureCode::Ok) {
1467 LOG(ERROR) << "Failed to switch snapshot to a second-phase merge target: " << snapshot;
1468 if (result == MergeFailureCode::Ok) {
1469 result = code;
1470 }
1471 }
1472 }
1473 return result;
1474 }
1475
GetSnapshotBootIndicatorPath()1476 std::string SnapshotManager::GetSnapshotBootIndicatorPath() {
1477 return metadata_dir_ + "/" + android::base::Basename(kBootIndicatorPath);
1478 }
1479
GetRollbackIndicatorPath()1480 std::string SnapshotManager::GetRollbackIndicatorPath() {
1481 return metadata_dir_ + "/" + android::base::Basename(kRollbackIndicatorPath);
1482 }
1483
GetForwardMergeIndicatorPath()1484 std::string SnapshotManager::GetForwardMergeIndicatorPath() {
1485 return metadata_dir_ + "/allow-forward-merge";
1486 }
1487
GetOldPartitionMetadataPath()1488 std::string SnapshotManager::GetOldPartitionMetadataPath() {
1489 return metadata_dir_ + "/old-partition-metadata";
1490 }
1491
AcknowledgeMergeSuccess(LockedFile * lock)1492 void SnapshotManager::AcknowledgeMergeSuccess(LockedFile* lock) {
1493 // It's not possible to remove update state in recovery, so write an
1494 // indicator that cleanup is needed on reboot. If a factory data reset
1495 // was requested, it doesn't matter, everything will get wiped anyway.
1496 // To make testing easier we consider a /data wipe as cleaned up.
1497 if (device_->IsRecovery()) {
1498 WriteUpdateState(lock, UpdateState::MergeCompleted);
1499 return;
1500 }
1501
1502 RemoveAllUpdateState(lock);
1503
1504 if (UpdateUsesUserSnapshots(lock) && !device()->IsTestDevice()) {
1505 if (snapuserd_client_) {
1506 snapuserd_client_->DetachSnapuserd();
1507 snapuserd_client_->RemoveTransitionedDaemonIndicator();
1508 snapuserd_client_ = nullptr;
1509 }
1510 }
1511 }
1512
AcknowledgeMergeFailure(MergeFailureCode failure_code)1513 void SnapshotManager::AcknowledgeMergeFailure(MergeFailureCode failure_code) {
1514 // Log first, so worst case, we always have a record of why the calls below
1515 // were being made.
1516 LOG(ERROR) << "Merge could not be completed and will be marked as failed.";
1517
1518 auto lock = LockExclusive();
1519 if (!lock) return;
1520
1521 // Since we released the lock in between WaitForMerge and here, it's
1522 // possible (1) the merge successfully completed or (2) was already
1523 // marked as a failure. So make sure to check the state again, and
1524 // only mark as a failure if appropriate.
1525 UpdateState state = ReadUpdateState(lock.get());
1526 if (state != UpdateState::Merging && state != UpdateState::MergeNeedsReboot) {
1527 return;
1528 }
1529
1530 WriteUpdateState(lock.get(), UpdateState::MergeFailed, failure_code);
1531 }
1532
OnSnapshotMergeComplete(LockedFile * lock,const std::string & name,const SnapshotStatus & status)1533 bool SnapshotManager::OnSnapshotMergeComplete(LockedFile* lock, const std::string& name,
1534 const SnapshotStatus& status) {
1535 if (!UpdateUsesUserSnapshots(lock)) {
1536 if (IsSnapshotDevice(name)) {
1537 // We are extra-cautious here, to avoid deleting the wrong table.
1538 std::string target_type;
1539 DmTargetSnapshot::Status dm_status;
1540 if (!QuerySnapshotStatus(name, &target_type, &dm_status)) {
1541 return false;
1542 }
1543 if (target_type != "snapshot-merge") {
1544 LOG(ERROR) << "Unexpected target type " << target_type
1545 << " for snapshot device: " << name;
1546 return false;
1547 }
1548 if (dm_status.sectors_allocated != dm_status.metadata_sectors) {
1549 LOG(ERROR) << "Merge is unexpectedly incomplete for device " << name;
1550 return false;
1551 }
1552 if (!CollapseSnapshotDevice(lock, name, status)) {
1553 LOG(ERROR) << "Unable to collapse snapshot: " << name;
1554 return false;
1555 }
1556 }
1557 } else {
1558 // Just collapse the device - no need to query again as we just did
1559 // prior to calling this function
1560 if (!CollapseSnapshotDevice(lock, name, status)) {
1561 LOG(ERROR) << "Unable to collapse snapshot: " << name;
1562 return false;
1563 }
1564 }
1565
1566 // Note that collapsing is implicitly an Unmap, so we don't need to
1567 // unmap the snapshot.
1568
1569 if (!DeleteSnapshot(lock, name)) {
1570 LOG(ERROR) << "Could not delete snapshot: " << name;
1571 return false;
1572 }
1573 return true;
1574 }
1575
CollapseSnapshotDevice(LockedFile * lock,const std::string & name,const SnapshotStatus & status)1576 bool SnapshotManager::CollapseSnapshotDevice(LockedFile* lock, const std::string& name,
1577 const SnapshotStatus& status) {
1578 if (!UpdateUsesUserSnapshots(lock)) {
1579 // Verify we have a snapshot-merge device.
1580 DeviceMapper::TargetInfo target;
1581 if (!GetSingleTarget(name, TableQuery::Table, &target)) {
1582 return false;
1583 }
1584 if (DeviceMapper::GetTargetType(target.spec) != "snapshot-merge") {
1585 // This should be impossible, it was checked earlier.
1586 LOG(ERROR) << "Snapshot device has invalid target type: " << name;
1587 return false;
1588 }
1589
1590 std::string base_device, cow_device;
1591 if (!DmTargetSnapshot::GetDevicesFromParams(target.data, &base_device, &cow_device)) {
1592 LOG(ERROR) << "Could not parse snapshot device " << name
1593 << " parameters: " << target.data;
1594 return false;
1595 }
1596 }
1597
1598 uint64_t snapshot_sectors = status.snapshot_size() / kSectorSize;
1599 if (snapshot_sectors * kSectorSize != status.snapshot_size()) {
1600 LOG(ERROR) << "Snapshot " << name
1601 << " size is not sector aligned: " << status.snapshot_size();
1602 return false;
1603 }
1604
1605 uint32_t slot = SlotNumberForSlotSuffix(device_->GetSlotSuffix());
1606 // Create a DmTable that is identical to the base device.
1607 CreateLogicalPartitionParams base_device_params{
1608 .block_device = device_->GetSuperDevice(slot),
1609 .metadata_slot = slot,
1610 .partition_name = name,
1611 .partition_opener = &device_->GetPartitionOpener(),
1612 };
1613 DmTable table;
1614 if (!CreateDmTable(base_device_params, &table)) {
1615 LOG(ERROR) << "Could not create a DmTable for partition: " << name;
1616 return false;
1617 }
1618
1619 if (!dm_.LoadTableAndActivate(name, table)) {
1620 return false;
1621 }
1622
1623 if (!UpdateUsesUserSnapshots(lock)) {
1624 // Attempt to delete the snapshot device if one still exists. Nothing
1625 // should be depending on the device, and device-mapper should have
1626 // flushed remaining I/O. We could in theory replace with dm-zero (or
1627 // re-use the table above), but for now it's better to know why this
1628 // would fail.
1629 //
1630 // Furthermore, we should not be trying to unmap for userspace snapshot
1631 // as unmap will fail since dm-user itself was a snapshot device prior
1632 // to switching of tables. Unmap will fail as the device will be mounted
1633 // by system partitions
1634 if (status.using_snapuserd()) {
1635 auto dm_user_name = GetDmUserCowName(name, GetSnapshotDriver(lock));
1636 UnmapDmUserDevice(dm_user_name);
1637 }
1638 }
1639
1640 // We can't delete base device immediately as daemon holds a reference.
1641 // Make sure we wait for all the worker threads to terminate and release
1642 // the reference
1643 if (UpdateUsesUserSnapshots(lock) && EnsureSnapuserdConnected()) {
1644 if (!snapuserd_client_->WaitForDeviceDelete(name)) {
1645 LOG(ERROR) << "Failed to wait for " << name << " control device to delete";
1646 }
1647 }
1648
1649 auto base_name = GetBaseDeviceName(name);
1650 if (!DeleteDeviceIfExists(base_name)) {
1651 LOG(ERROR) << "Unable to delete base device for snapshot: " << base_name;
1652 }
1653
1654 if (!DeleteDeviceIfExists(GetSourceDeviceName(name), 4000ms)) {
1655 LOG(ERROR) << "Unable to delete source device for snapshot: " << GetSourceDeviceName(name);
1656 }
1657
1658 return true;
1659 }
1660
HandleCancelledUpdate(LockedFile * lock,const std::function<bool ()> & before_cancel)1661 bool SnapshotManager::HandleCancelledUpdate(LockedFile* lock,
1662 const std::function<bool()>& before_cancel) {
1663 auto slot = GetCurrentSlot();
1664 if (slot == Slot::Unknown) {
1665 return false;
1666 }
1667
1668 // If all snapshots were reflashed, then cancel the entire update.
1669 if (AreAllSnapshotsCancelled(lock)) {
1670 LOG(WARNING) << "Detected re-flashing, cancelling unverified update.";
1671 return RemoveAllUpdateState(lock, before_cancel);
1672 }
1673
1674 // If update has been rolled back, then cancel the entire update.
1675 // Client (update_engine) is responsible for doing additional cleanup work on its own states
1676 // when ProcessUpdateState() returns UpdateState::Cancelled.
1677 auto current_slot = GetCurrentSlot();
1678 if (current_slot != Slot::Source) {
1679 LOG(INFO) << "Update state is being processed while booting at " << current_slot
1680 << " slot, taking no action.";
1681 return false;
1682 }
1683
1684 // current_slot == Source. Attempt to detect rollbacks.
1685 if (access(GetRollbackIndicatorPath().c_str(), F_OK) != 0) {
1686 // This unverified update is not attempted. Take no action.
1687 PLOG(INFO) << "Rollback indicator not detected. "
1688 << "Update state is being processed before reboot, taking no action.";
1689 return false;
1690 }
1691
1692 LOG(WARNING) << "Detected rollback, cancelling unverified update.";
1693 return RemoveAllUpdateState(lock, before_cancel);
1694 }
1695
PerformInitTransition(InitTransition transition,std::vector<std::string> * snapuserd_argv)1696 bool SnapshotManager::PerformInitTransition(InitTransition transition,
1697 std::vector<std::string>* snapuserd_argv) {
1698 LOG(INFO) << "Performing transition for snapuserd.";
1699
1700 // Don't use EnsureSnapuserdConnected() because this is called from init,
1701 // and attempting to do so will deadlock.
1702 if (!snapuserd_client_ && transition != InitTransition::SELINUX_DETACH) {
1703 snapuserd_client_ = SnapuserdClient::Connect(kSnapuserdSocket, 10s);
1704 if (!snapuserd_client_) {
1705 LOG(ERROR) << "Unable to connect to snapuserd";
1706 return false;
1707 }
1708 }
1709
1710 auto lock = LockExclusive();
1711 if (!lock) return false;
1712
1713 std::vector<std::string> snapshots;
1714 if (!ListSnapshots(lock.get(), &snapshots)) {
1715 LOG(ERROR) << "Failed to list snapshots.";
1716 return false;
1717 }
1718
1719 if (UpdateUsesUserSnapshots(lock.get()) && transition == InitTransition::SELINUX_DETACH) {
1720 snapuserd_argv->emplace_back("-user_snapshot");
1721 if (UpdateUsesIouring(lock.get())) {
1722 snapuserd_argv->emplace_back("-io_uring");
1723 }
1724 }
1725
1726 size_t num_cows = 0;
1727 size_t ok_cows = 0;
1728 for (const auto& snapshot : snapshots) {
1729 std::string user_cow_name = GetDmUserCowName(snapshot, GetSnapshotDriver(lock.get()));
1730
1731 if (dm_.GetState(user_cow_name) == DmDeviceState::INVALID) {
1732 continue;
1733 }
1734
1735 DeviceMapper::TargetInfo target;
1736 if (!GetSingleTarget(user_cow_name, TableQuery::Table, &target)) {
1737 continue;
1738 }
1739
1740 auto target_type = DeviceMapper::GetTargetType(target.spec);
1741 if (target_type != "user") {
1742 LOG(ERROR) << "Unexpected target type for " << user_cow_name << ": " << target_type;
1743 continue;
1744 }
1745
1746 num_cows++;
1747
1748 SnapshotStatus snapshot_status;
1749 if (!ReadSnapshotStatus(lock.get(), snapshot, &snapshot_status)) {
1750 LOG(ERROR) << "Unable to read snapshot status: " << snapshot;
1751 continue;
1752 }
1753
1754 auto misc_name = user_cow_name;
1755
1756 std::string source_device_name;
1757 if (snapshot_status.old_partition_size() > 0) {
1758 source_device_name = GetSourceDeviceName(snapshot);
1759 } else {
1760 source_device_name = GetBaseDeviceName(snapshot);
1761 }
1762
1763 std::string source_device;
1764 if (!dm_.GetDmDevicePathByName(source_device_name, &source_device)) {
1765 LOG(ERROR) << "Could not get device path for " << GetSourceDeviceName(snapshot);
1766 continue;
1767 }
1768
1769 std::string base_path_merge;
1770 if (!dm_.GetDmDevicePathByName(GetBaseDeviceName(snapshot), &base_path_merge)) {
1771 LOG(ERROR) << "Could not get device path for " << GetSourceDeviceName(snapshot);
1772 continue;
1773 }
1774
1775 std::string cow_image_name = GetMappedCowDeviceName(snapshot, snapshot_status);
1776
1777 std::string cow_image_device;
1778 if (!dm_.GetDmDevicePathByName(cow_image_name, &cow_image_device)) {
1779 LOG(ERROR) << "Could not get device path for " << cow_image_name;
1780 continue;
1781 }
1782
1783 if (transition == InitTransition::SELINUX_DETACH) {
1784 if (!UpdateUsesUserSnapshots(lock.get())) {
1785 auto message = misc_name + "," + cow_image_device + "," + source_device;
1786 snapuserd_argv->emplace_back(std::move(message));
1787 } else {
1788 auto message = misc_name + "," + cow_image_device + "," + source_device + "," +
1789 base_path_merge;
1790 snapuserd_argv->emplace_back(std::move(message));
1791 }
1792
1793 // Do not attempt to connect to the new snapuserd yet, it hasn't
1794 // been started. We do however want to wait for the misc device
1795 // to have been created.
1796 ok_cows++;
1797 continue;
1798 }
1799
1800 DmTable table;
1801 table.Emplace<DmTargetUser>(0, target.spec.length, misc_name);
1802 if (!dm_.LoadTableAndActivate(user_cow_name, table)) {
1803 LOG(ERROR) << "Unable to swap tables for " << misc_name;
1804 continue;
1805 }
1806
1807 // Wait for ueventd to acknowledge and create the control device node.
1808 std::string control_device = "/dev/dm-user/" + misc_name;
1809 if (!WaitForDevice(control_device, 10s)) {
1810 LOG(ERROR) << "dm-user control device no found: " << misc_name;
1811 continue;
1812 }
1813
1814 uint64_t base_sectors;
1815 if (!UpdateUsesUserSnapshots(lock.get())) {
1816 base_sectors =
1817 snapuserd_client_->InitDmUserCow(misc_name, cow_image_device, source_device);
1818 } else {
1819 base_sectors = snapuserd_client_->InitDmUserCow(misc_name, cow_image_device,
1820 source_device, base_path_merge);
1821 }
1822
1823 if (base_sectors == 0) {
1824 // Unrecoverable as metadata reads from cow device failed
1825 LOG(FATAL) << "Failed to retrieve base_sectors from Snapuserd";
1826 return false;
1827 }
1828
1829 CHECK(base_sectors <= target.spec.length);
1830
1831 if (!snapuserd_client_->AttachDmUser(misc_name)) {
1832 // This error is unrecoverable. We cannot proceed because reads to
1833 // the underlying device will fail.
1834 LOG(FATAL) << "Could not initialize snapuserd for " << user_cow_name;
1835 return false;
1836 }
1837
1838 ok_cows++;
1839 }
1840
1841 if (ok_cows != num_cows) {
1842 LOG(ERROR) << "Could not transition all snapuserd consumers.";
1843 return false;
1844 }
1845 return true;
1846 }
1847
ReadCurrentMetadata()1848 std::unique_ptr<LpMetadata> SnapshotManager::ReadCurrentMetadata() {
1849 const auto& opener = device_->GetPartitionOpener();
1850 uint32_t slot = SlotNumberForSlotSuffix(device_->GetSlotSuffix());
1851 auto super_device = device_->GetSuperDevice(slot);
1852 auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, slot);
1853 if (!metadata) {
1854 LOG(ERROR) << "Could not read dynamic partition metadata for device: " << super_device;
1855 return nullptr;
1856 }
1857 return metadata;
1858 }
1859
GetMetadataPartitionState(const LpMetadata & metadata,const std::string & name)1860 SnapshotManager::MetadataPartitionState SnapshotManager::GetMetadataPartitionState(
1861 const LpMetadata& metadata, const std::string& name) {
1862 auto partition = android::fs_mgr::FindPartition(metadata, name);
1863 if (!partition) return MetadataPartitionState::None;
1864 if (partition->attributes & LP_PARTITION_ATTR_UPDATED) {
1865 return MetadataPartitionState::Updated;
1866 }
1867 return MetadataPartitionState::Flashed;
1868 }
1869
AreAllSnapshotsCancelled(LockedFile * lock)1870 bool SnapshotManager::AreAllSnapshotsCancelled(LockedFile* lock) {
1871 std::vector<std::string> snapshots;
1872 if (!ListSnapshots(lock, &snapshots)) {
1873 LOG(WARNING) << "Failed to list snapshots to determine whether device has been flashed "
1874 << "after applying an update. Assuming no snapshots.";
1875 // Let HandleCancelledUpdate resets UpdateState.
1876 return true;
1877 }
1878
1879 std::map<std::string, bool> flashing_status;
1880
1881 if (!GetSnapshotFlashingStatus(lock, snapshots, &flashing_status)) {
1882 LOG(WARNING) << "Failed to determine whether partitions have been flashed. Not"
1883 << "removing update states.";
1884 return false;
1885 }
1886
1887 bool all_snapshots_cancelled = std::all_of(flashing_status.begin(), flashing_status.end(),
1888 [](const auto& pair) { return pair.second; });
1889
1890 if (all_snapshots_cancelled) {
1891 LOG(WARNING) << "All partitions are re-flashed after update, removing all update states.";
1892 }
1893 return all_snapshots_cancelled;
1894 }
1895
GetSnapshotFlashingStatus(LockedFile * lock,const std::vector<std::string> & snapshots,std::map<std::string,bool> * out)1896 bool SnapshotManager::GetSnapshotFlashingStatus(LockedFile* lock,
1897 const std::vector<std::string>& snapshots,
1898 std::map<std::string, bool>* out) {
1899 CHECK(lock);
1900
1901 auto source_slot_suffix = ReadUpdateSourceSlotSuffix();
1902 if (source_slot_suffix.empty()) {
1903 return false;
1904 }
1905 uint32_t source_slot = SlotNumberForSlotSuffix(source_slot_suffix);
1906 uint32_t target_slot = (source_slot == 0) ? 1 : 0;
1907
1908 // Attempt to detect re-flashing on each partition.
1909 // - If all partitions are re-flashed, we can proceed to cancel the whole update.
1910 // - If only some of the partitions are re-flashed, snapshots for re-flashed partitions are
1911 // deleted. Caller is responsible for merging the rest of the snapshots.
1912 // - If none of the partitions are re-flashed, caller is responsible for merging the snapshots.
1913 //
1914 // Note that we use target slot metadata, since if an OTA has been applied
1915 // to the target slot, we can detect the UPDATED flag. Any kind of flash
1916 // operation against dynamic partitions ensures that all copies of the
1917 // metadata are in sync, so flashing all partitions on the source slot will
1918 // remove the UPDATED flag on the target slot as well.
1919 const auto& opener = device_->GetPartitionOpener();
1920 auto super_device = device_->GetSuperDevice(target_slot);
1921 auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, target_slot);
1922 if (!metadata) {
1923 return false;
1924 }
1925
1926 for (const auto& snapshot_name : snapshots) {
1927 if (GetMetadataPartitionState(*metadata, snapshot_name) ==
1928 MetadataPartitionState::Updated) {
1929 out->emplace(snapshot_name, false);
1930 } else {
1931 // Delete snapshots for partitions that are re-flashed after the update.
1932 LOG(WARNING) << "Detected re-flashing of partition " << snapshot_name << ".";
1933 out->emplace(snapshot_name, true);
1934 }
1935 }
1936 return true;
1937 }
1938
RemoveInvalidSnapshots(LockedFile * lock)1939 void SnapshotManager::RemoveInvalidSnapshots(LockedFile* lock) {
1940 std::vector<std::string> snapshots;
1941
1942 // Remove the stale snapshot metadata
1943 //
1944 // We make sure that all the three cases
1945 // are valid before removing the snapshot metadata:
1946 //
1947 // 1: dm state is active
1948 // 2: Root fs is not mounted off as a snapshot device
1949 // 3: Snapshot slot suffix should match current device slot
1950 if (!ListSnapshots(lock, &snapshots, device_->GetSlotSuffix()) || snapshots.empty()) {
1951 return;
1952 }
1953
1954 // We indeed have some invalid snapshots
1955 for (const auto& name : snapshots) {
1956 if (dm_.GetState(name) == DmDeviceState::ACTIVE && !IsSnapshotDevice(name)) {
1957 if (!DeleteSnapshot(lock, name)) {
1958 LOG(ERROR) << "Failed to delete invalid snapshot: " << name;
1959 } else {
1960 LOG(INFO) << "Invalid snapshot: " << name << " deleted";
1961 }
1962 }
1963 }
1964 }
1965
RemoveAllSnapshots(LockedFile * lock)1966 bool SnapshotManager::RemoveAllSnapshots(LockedFile* lock) {
1967 std::vector<std::string> snapshots;
1968 if (!ListSnapshots(lock, &snapshots)) {
1969 LOG(ERROR) << "Could not list snapshots";
1970 return false;
1971 }
1972
1973 std::map<std::string, bool> flashing_status;
1974 if (!GetSnapshotFlashingStatus(lock, snapshots, &flashing_status)) {
1975 LOG(WARNING) << "Failed to get flashing status";
1976 }
1977
1978 auto current_slot = GetCurrentSlot();
1979 bool ok = true;
1980 bool has_mapped_cow_images = false;
1981 for (const auto& name : snapshots) {
1982 // If booting off source slot, it is okay to unmap and delete all the snapshots.
1983 // If boot indicator is missing, update state is None or Initiated, so
1984 // it is also okay to unmap and delete all the snapshots.
1985 // If booting off target slot,
1986 // - should not unmap because:
1987 // - In Android mode, snapshots are not mapped, but
1988 // filesystems are mounting off dm-linear targets directly.
1989 // - In recovery mode, assume nothing is mapped, so it is optional to unmap.
1990 // - If partition is flashed or unknown, it is okay to delete snapshots.
1991 // Otherwise (UPDATED flag), only delete snapshots if they are not mapped
1992 // as dm-snapshot (for example, after merge completes).
1993 bool should_unmap = current_slot != Slot::Target;
1994 bool should_delete = ShouldDeleteSnapshot(flashing_status, current_slot, name);
1995 if (should_unmap && android::base::EndsWith(name, device_->GetSlotSuffix())) {
1996 // Something very unexpected has happened - we want to unmap this
1997 // snapshot, but it's on the wrong slot. We can't unmap an active
1998 // partition. If this is not really a snapshot, skip the unmap
1999 // step.
2000 if (dm_.GetState(name) == DmDeviceState::INVALID || !IsSnapshotDevice(name)) {
2001 LOG(ERROR) << "Detected snapshot " << name << " on " << current_slot << " slot"
2002 << " for source partition; removing without unmap.";
2003 should_unmap = false;
2004 }
2005 }
2006
2007 bool partition_ok = true;
2008 if (should_unmap && !UnmapPartitionWithSnapshot(lock, name)) {
2009 partition_ok = false;
2010 }
2011 if (partition_ok && should_delete && !DeleteSnapshot(lock, name)) {
2012 partition_ok = false;
2013 }
2014
2015 if (!partition_ok) {
2016 // Remember whether or not we were able to unmap the cow image.
2017 auto cow_image_device = GetCowImageDeviceName(name);
2018 has_mapped_cow_images |=
2019 (EnsureImageManager() && images_->IsImageMapped(cow_image_device));
2020
2021 ok = false;
2022 }
2023 }
2024
2025 if (ok || !has_mapped_cow_images) {
2026 // Delete any image artifacts as a precaution, in case an update is
2027 // being cancelled due to some corrupted state in an lp_metadata file.
2028 // Note that we do not do this if some cow images are still mapped,
2029 // since we must not remove backing storage if it's in use.
2030 if (!EnsureImageManager() || !images_->RemoveAllImages()) {
2031 LOG(ERROR) << "Could not remove all snapshot artifacts";
2032 return false;
2033 }
2034 }
2035 return ok;
2036 }
2037
2038 // See comments in RemoveAllSnapshots().
ShouldDeleteSnapshot(const std::map<std::string,bool> & flashing_status,Slot current_slot,const std::string & name)2039 bool SnapshotManager::ShouldDeleteSnapshot(const std::map<std::string, bool>& flashing_status,
2040 Slot current_slot, const std::string& name) {
2041 if (current_slot != Slot::Target) {
2042 return true;
2043 }
2044 auto it = flashing_status.find(name);
2045 if (it == flashing_status.end()) {
2046 LOG(WARNING) << "Can't determine flashing status for " << name;
2047 return true;
2048 }
2049 if (it->second) {
2050 // partition flashed, okay to delete obsolete snapshots
2051 return true;
2052 }
2053 return !IsSnapshotDevice(name);
2054 }
2055
GetUpdateState(double * progress)2056 UpdateState SnapshotManager::GetUpdateState(double* progress) {
2057 // If we've never started an update, the state file won't exist.
2058 auto state_file = GetStateFilePath();
2059 if (access(state_file.c_str(), F_OK) != 0 && errno == ENOENT) {
2060 return UpdateState::None;
2061 }
2062
2063 auto lock = LockShared();
2064 if (!lock) {
2065 return UpdateState::None;
2066 }
2067
2068 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock.get());
2069 auto state = update_status.state();
2070 if (progress == nullptr) {
2071 return state;
2072 }
2073
2074 if (state == UpdateState::MergeCompleted) {
2075 *progress = 100.0;
2076 return state;
2077 }
2078
2079 *progress = 0.0;
2080 if (state != UpdateState::Merging) {
2081 return state;
2082 }
2083
2084 if (!UpdateUsesUserSnapshots(lock.get())) {
2085 // Sum all the snapshot states as if the system consists of a single huge
2086 // snapshots device, then compute the merge completion percentage of that
2087 // device.
2088 std::vector<std::string> snapshots;
2089 if (!ListSnapshots(lock.get(), &snapshots)) {
2090 LOG(ERROR) << "Could not list snapshots";
2091 return state;
2092 }
2093
2094 DmTargetSnapshot::Status fake_snapshots_status = {};
2095 for (const auto& snapshot : snapshots) {
2096 DmTargetSnapshot::Status current_status;
2097
2098 if (!IsSnapshotDevice(snapshot)) continue;
2099 if (!QuerySnapshotStatus(snapshot, nullptr, ¤t_status)) continue;
2100
2101 fake_snapshots_status.sectors_allocated += current_status.sectors_allocated;
2102 fake_snapshots_status.total_sectors += current_status.total_sectors;
2103 fake_snapshots_status.metadata_sectors += current_status.metadata_sectors;
2104 }
2105
2106 *progress = DmTargetSnapshot::MergePercent(fake_snapshots_status,
2107 update_status.sectors_allocated());
2108 } else {
2109 if (EnsureSnapuserdConnected()) {
2110 *progress = snapuserd_client_->GetMergePercent();
2111 }
2112 }
2113
2114 return state;
2115 }
2116
UpdateUsesCompression()2117 bool SnapshotManager::UpdateUsesCompression() {
2118 auto lock = LockShared();
2119 if (!lock) return false;
2120 return UpdateUsesCompression(lock.get());
2121 }
2122
UpdateUsesCompression(LockedFile * lock)2123 bool SnapshotManager::UpdateUsesCompression(LockedFile* lock) {
2124 // This returns true even if compression is "none", since update_engine is
2125 // really just trying to see if snapuserd is in use.
2126 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock);
2127 return update_status.using_snapuserd();
2128 }
2129
UpdateUsesIouring(LockedFile * lock)2130 bool SnapshotManager::UpdateUsesIouring(LockedFile* lock) {
2131 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock);
2132 return update_status.io_uring_enabled();
2133 }
2134
UpdateUsesUserSnapshots()2135 bool SnapshotManager::UpdateUsesUserSnapshots() {
2136 // This and the following function is constantly
2137 // invoked during snapshot merge. We want to avoid
2138 // constantly reading from disk. Hence, store this
2139 // value in memory.
2140 //
2141 // Furthermore, this value in the disk is set
2142 // only when OTA is applied and doesn't change
2143 // during merge phase. Hence, once we know that
2144 // the value is read from disk the very first time,
2145 // it is safe to read successive checks from memory.
2146 if (is_snapshot_userspace_.has_value()) {
2147 return is_snapshot_userspace_.value();
2148 }
2149
2150 auto lock = LockShared();
2151 if (!lock) return false;
2152
2153 return UpdateUsesUserSnapshots(lock.get());
2154 }
2155
UpdateUsesUserSnapshots(LockedFile * lock)2156 bool SnapshotManager::UpdateUsesUserSnapshots(LockedFile* lock) {
2157 // See UpdateUsesUserSnapshots()
2158 if (is_snapshot_userspace_.has_value()) {
2159 return is_snapshot_userspace_.value();
2160 }
2161
2162 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock);
2163 is_snapshot_userspace_ = update_status.userspace_snapshots();
2164 return is_snapshot_userspace_.value();
2165 }
2166
ListSnapshots(LockedFile * lock,std::vector<std::string> * snapshots,const std::string & suffix)2167 bool SnapshotManager::ListSnapshots(LockedFile* lock, std::vector<std::string>* snapshots,
2168 const std::string& suffix) {
2169 CHECK(lock);
2170
2171 auto dir_path = metadata_dir_ + "/snapshots"s;
2172 std::unique_ptr<DIR, decltype(&closedir)> dir(opendir(dir_path.c_str()), closedir);
2173 if (!dir) {
2174 PLOG(ERROR) << "opendir failed: " << dir_path;
2175 return false;
2176 }
2177
2178 struct dirent* dp;
2179 while ((dp = readdir(dir.get())) != nullptr) {
2180 if (dp->d_type != DT_REG) continue;
2181
2182 std::string name(dp->d_name);
2183 if (!suffix.empty() && !android::base::EndsWith(name, suffix)) {
2184 continue;
2185 }
2186
2187 // Insert system and product partition at the beginning so that
2188 // during snapshot-merge, these partitions are merged first.
2189 if (name == "system_a" || name == "system_b" || name == "product_a" ||
2190 name == "product_b") {
2191 snapshots->insert(snapshots->begin(), std::move(name));
2192 } else {
2193 snapshots->emplace_back(std::move(name));
2194 }
2195 }
2196
2197 return true;
2198 }
2199
IsSnapshotManagerNeeded()2200 bool SnapshotManager::IsSnapshotManagerNeeded() {
2201 return access(kBootIndicatorPath, F_OK) == 0;
2202 }
2203
GetGlobalRollbackIndicatorPath()2204 std::string SnapshotManager::GetGlobalRollbackIndicatorPath() {
2205 return kRollbackIndicatorPath;
2206 }
2207
NeedSnapshotsInFirstStageMount()2208 bool SnapshotManager::NeedSnapshotsInFirstStageMount() {
2209 // If we fail to read, we'll wind up using CreateLogicalPartitions, which
2210 // will create devices that look like the old slot, except with extra
2211 // content at the end of each device. This will confuse dm-verity, and
2212 // ultimately we'll fail to boot. Why not make it a fatal error and have
2213 // the reason be clearer? Because the indicator file still exists, and
2214 // if this was FATAL, reverting to the old slot would be broken.
2215 auto slot = GetCurrentSlot();
2216
2217 if (slot != Slot::Target) {
2218 if (slot == Slot::Source) {
2219 // Device is rebooting into the original slot, so mark this as a
2220 // rollback.
2221 auto path = GetRollbackIndicatorPath();
2222 if (!android::base::WriteStringToFile("1", path)) {
2223 PLOG(ERROR) << "Unable to write rollback indicator: " << path;
2224 } else {
2225 LOG(INFO) << "Rollback detected, writing rollback indicator to " << path;
2226 }
2227 }
2228 LOG(INFO) << "Not booting from new slot. Will not mount snapshots.";
2229 return false;
2230 }
2231
2232 // If we can't read the update state, it's unlikely anything else will
2233 // succeed, so this is a fatal error. We'll eventually exhaust boot
2234 // attempts and revert to the old slot.
2235 auto lock = LockShared();
2236 if (!lock) {
2237 LOG(FATAL) << "Could not read update state to determine snapshot status";
2238 return false;
2239 }
2240 switch (ReadUpdateState(lock.get())) {
2241 case UpdateState::Unverified:
2242 case UpdateState::Merging:
2243 case UpdateState::MergeFailed:
2244 return true;
2245 default:
2246 return false;
2247 }
2248 }
2249
CreateLogicalAndSnapshotPartitions(const std::string & super_device,const std::chrono::milliseconds & timeout_ms)2250 bool SnapshotManager::CreateLogicalAndSnapshotPartitions(
2251 const std::string& super_device, const std::chrono::milliseconds& timeout_ms) {
2252 LOG(INFO) << "Creating logical partitions with snapshots as needed";
2253
2254 auto lock = LockExclusive();
2255 if (!lock) return false;
2256
2257 uint32_t slot = SlotNumberForSlotSuffix(device_->GetSlotSuffix());
2258 return MapAllPartitions(lock.get(), super_device, slot, timeout_ms);
2259 }
2260
MapAllPartitions(LockedFile * lock,const std::string & super_device,uint32_t slot,const std::chrono::milliseconds & timeout_ms)2261 bool SnapshotManager::MapAllPartitions(LockedFile* lock, const std::string& super_device,
2262 uint32_t slot, const std::chrono::milliseconds& timeout_ms) {
2263 const auto& opener = device_->GetPartitionOpener();
2264 auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, slot);
2265 if (!metadata) {
2266 LOG(ERROR) << "Could not read dynamic partition metadata for device: " << super_device;
2267 return false;
2268 }
2269
2270 if (!EnsureImageManager()) {
2271 return false;
2272 }
2273
2274 for (const auto& partition : metadata->partitions) {
2275 if (GetPartitionGroupName(metadata->groups[partition.group_index]) == kCowGroupName) {
2276 LOG(INFO) << "Skip mapping partition " << GetPartitionName(partition) << " in group "
2277 << kCowGroupName;
2278 continue;
2279 }
2280
2281 CreateLogicalPartitionParams params = {
2282 .block_device = super_device,
2283 .metadata = metadata.get(),
2284 .partition = &partition,
2285 .timeout_ms = timeout_ms,
2286 .partition_opener = &opener,
2287 };
2288 if (!MapPartitionWithSnapshot(lock, std::move(params), SnapshotContext::Mount, nullptr)) {
2289 return false;
2290 }
2291 }
2292
2293 LOG(INFO) << "Created logical partitions with snapshot.";
2294 return true;
2295 }
2296
GetRemainingTime(const std::chrono::milliseconds & timeout,const std::chrono::time_point<std::chrono::steady_clock> & begin)2297 static std::chrono::milliseconds GetRemainingTime(
2298 const std::chrono::milliseconds& timeout,
2299 const std::chrono::time_point<std::chrono::steady_clock>& begin) {
2300 // If no timeout is specified, execute all commands without specifying any timeout.
2301 if (timeout.count() == 0) return std::chrono::milliseconds(0);
2302 auto passed_time = std::chrono::steady_clock::now() - begin;
2303 auto remaining_time = timeout - duration_cast<std::chrono::milliseconds>(passed_time);
2304 if (remaining_time.count() <= 0) {
2305 LOG(ERROR) << "MapPartitionWithSnapshot has reached timeout " << timeout.count() << "ms ("
2306 << remaining_time.count() << "ms remaining)";
2307 // Return min() instead of remaining_time here because 0 is treated as a special value for
2308 // no timeout, where the rest of the commands will still be executed.
2309 return std::chrono::milliseconds::min();
2310 }
2311 return remaining_time;
2312 }
2313
MapPartitionWithSnapshot(LockedFile * lock,CreateLogicalPartitionParams params,SnapshotContext context,SnapshotPaths * paths)2314 bool SnapshotManager::MapPartitionWithSnapshot(LockedFile* lock,
2315 CreateLogicalPartitionParams params,
2316 SnapshotContext context, SnapshotPaths* paths) {
2317 auto begin = std::chrono::steady_clock::now();
2318
2319 CHECK(lock);
2320
2321 if (params.GetPartitionName() != params.GetDeviceName()) {
2322 LOG(ERROR) << "Mapping snapshot with a different name is unsupported: partition_name = "
2323 << params.GetPartitionName() << ", device_name = " << params.GetDeviceName();
2324 return false;
2325 }
2326
2327 // Fill out fields in CreateLogicalPartitionParams so that we have more information (e.g. by
2328 // reading super partition metadata).
2329 CreateLogicalPartitionParams::OwnedData params_owned_data;
2330 if (!params.InitDefaults(¶ms_owned_data)) {
2331 return false;
2332 }
2333
2334 if (!params.partition->num_extents) {
2335 LOG(INFO) << "Skipping zero-length logical partition: " << params.GetPartitionName();
2336 return true; // leave path empty to indicate that nothing is mapped.
2337 }
2338
2339 // Determine if there is a live snapshot for the SnapshotStatus of the partition; i.e. if the
2340 // partition still has a snapshot that needs to be mapped. If no live snapshot or merge
2341 // completed, live_snapshot_status is set to nullopt.
2342 std::optional<SnapshotStatus> live_snapshot_status;
2343 do {
2344 if (!(params.partition->attributes & LP_PARTITION_ATTR_UPDATED)) {
2345 LOG(INFO) << "Detected re-flashing of partition, will skip snapshot: "
2346 << params.GetPartitionName();
2347 break;
2348 }
2349 auto file_path = GetSnapshotStatusFilePath(params.GetPartitionName());
2350 if (access(file_path.c_str(), F_OK) != 0) {
2351 if (errno != ENOENT) {
2352 PLOG(INFO) << "Can't map snapshot for " << params.GetPartitionName()
2353 << ": Can't access " << file_path;
2354 return false;
2355 }
2356 break;
2357 }
2358 live_snapshot_status = std::make_optional<SnapshotStatus>();
2359 if (!ReadSnapshotStatus(lock, params.GetPartitionName(), &*live_snapshot_status)) {
2360 return false;
2361 }
2362 // No live snapshot if merge is completed.
2363 if (live_snapshot_status->state() == SnapshotState::MERGE_COMPLETED) {
2364 live_snapshot_status.reset();
2365 }
2366
2367 if (live_snapshot_status->state() == SnapshotState::NONE ||
2368 live_snapshot_status->cow_partition_size() + live_snapshot_status->cow_file_size() ==
2369 0) {
2370 LOG(WARNING) << "Snapshot status for " << params.GetPartitionName()
2371 << " is invalid, ignoring: state = "
2372 << SnapshotState_Name(live_snapshot_status->state())
2373 << ", cow_partition_size = " << live_snapshot_status->cow_partition_size()
2374 << ", cow_file_size = " << live_snapshot_status->cow_file_size();
2375 live_snapshot_status.reset();
2376 }
2377 } while (0);
2378
2379 if (live_snapshot_status.has_value()) {
2380 // dm-snapshot requires the base device to be writable.
2381 params.force_writable = true;
2382 // Map the base device with a different name to avoid collision.
2383 params.device_name = GetBaseDeviceName(params.GetPartitionName());
2384 }
2385
2386 AutoDeviceList created_devices;
2387
2388 // Create the base device for the snapshot, or if there is no snapshot, the
2389 // device itself. This device consists of the real blocks in the super
2390 // partition that this logical partition occupies.
2391 std::string base_path;
2392 if (!CreateLogicalPartition(params, &base_path)) {
2393 LOG(ERROR) << "Could not create logical partition " << params.GetPartitionName()
2394 << " as device " << params.GetDeviceName();
2395 return false;
2396 }
2397 created_devices.EmplaceBack<AutoUnmapDevice>(&dm_, params.GetDeviceName());
2398
2399 if (paths) {
2400 paths->target_device = base_path;
2401 }
2402
2403 auto remaining_time = GetRemainingTime(params.timeout_ms, begin);
2404 if (remaining_time.count() < 0) {
2405 return false;
2406 }
2407
2408 // Wait for the base device to appear
2409 if (!WaitForDevice(base_path, remaining_time)) {
2410 return false;
2411 }
2412
2413 if (!live_snapshot_status.has_value()) {
2414 created_devices.Release();
2415 return true;
2416 }
2417
2418 // We don't have ueventd in first-stage init, so use device major:minor
2419 // strings instead.
2420 std::string base_device;
2421 if (!dm_.GetDeviceString(params.GetDeviceName(), &base_device)) {
2422 LOG(ERROR) << "Could not determine major/minor for: " << params.GetDeviceName();
2423 return false;
2424 }
2425
2426 remaining_time = GetRemainingTime(params.timeout_ms, begin);
2427 if (remaining_time.count() < 0) return false;
2428
2429 std::string cow_name;
2430 CreateLogicalPartitionParams cow_params = params;
2431 cow_params.timeout_ms = remaining_time;
2432 if (!MapCowDevices(lock, cow_params, *live_snapshot_status, &created_devices, &cow_name)) {
2433 return false;
2434 }
2435 std::string cow_device;
2436 if (!GetMappedImageDeviceStringOrPath(cow_name, &cow_device)) {
2437 LOG(ERROR) << "Could not determine major/minor for: " << cow_name;
2438 return false;
2439 }
2440 if (paths) {
2441 paths->cow_device_name = cow_name;
2442 }
2443
2444 remaining_time = GetRemainingTime(params.timeout_ms, begin);
2445 if (remaining_time.count() < 0) return false;
2446
2447 if (context == SnapshotContext::Update && live_snapshot_status->using_snapuserd()) {
2448 // Stop here, we can't run dm-user yet, the COW isn't built.
2449 created_devices.Release();
2450 return true;
2451 }
2452
2453 if (live_snapshot_status->using_snapuserd()) {
2454 // Get the source device (eg the view of the partition from before it was resized).
2455 std::string source_device_path;
2456 if (live_snapshot_status->old_partition_size() > 0) {
2457 if (!MapSourceDevice(lock, params.GetPartitionName(), remaining_time,
2458 &source_device_path)) {
2459 LOG(ERROR) << "Could not map source device for: " << cow_name;
2460 return false;
2461 }
2462
2463 auto source_device = GetSourceDeviceName(params.GetPartitionName());
2464 created_devices.EmplaceBack<AutoUnmapDevice>(&dm_, source_device);
2465 } else {
2466 source_device_path = base_path;
2467 }
2468
2469 if (!WaitForDevice(source_device_path, remaining_time)) {
2470 return false;
2471 }
2472
2473 std::string cow_path;
2474 if (!GetMappedImageDevicePath(cow_name, &cow_path)) {
2475 LOG(ERROR) << "Could not determine path for: " << cow_name;
2476 return false;
2477 }
2478 if (!WaitForDevice(cow_path, remaining_time)) {
2479 return false;
2480 }
2481
2482 auto name = GetDmUserCowName(params.GetPartitionName(), GetSnapshotDriver(lock));
2483
2484 std::string new_cow_device;
2485 if (!MapDmUserCow(lock, name, cow_path, source_device_path, base_path, remaining_time,
2486 &new_cow_device)) {
2487 LOG(ERROR) << "Could not map dm-user device for partition "
2488 << params.GetPartitionName();
2489 return false;
2490 }
2491 created_devices.EmplaceBack<AutoUnmapDevice>(&dm_, name);
2492
2493 remaining_time = GetRemainingTime(params.timeout_ms, begin);
2494 if (remaining_time.count() < 0) return false;
2495
2496 cow_device = new_cow_device;
2497 }
2498
2499 // For userspace snapshots, dm-user block device itself will act as a
2500 // snapshot device. There is one subtle difference - MapSnapshot will create
2501 // either snapshot target or snapshot-merge target based on the underlying
2502 // state of the snapshot device. If snapshot-merge target is created, merge
2503 // will immediately start in the kernel.
2504 //
2505 // This is no longer true with respect to userspace snapshots. When dm-user
2506 // block device is created, we just have the snapshots ready but daemon in
2507 // the user-space will not start the merge. We have to explicitly inform the
2508 // daemon to resume the merge. Check ProcessUpdateState() call stack.
2509 if (!UpdateUsesUserSnapshots(lock)) {
2510 std::string path;
2511 if (!MapSnapshot(lock, params.GetPartitionName(), base_device, cow_device, remaining_time,
2512 &path)) {
2513 LOG(ERROR) << "Could not map snapshot for partition: " << params.GetPartitionName();
2514 return false;
2515 }
2516 // No need to add params.GetPartitionName() to created_devices since it is immediately
2517 // released.
2518
2519 if (paths) {
2520 paths->snapshot_device = path;
2521 }
2522 LOG(INFO) << "Mapped " << params.GetPartitionName() << " as snapshot device at " << path;
2523 } else {
2524 LOG(INFO) << "Mapped " << params.GetPartitionName() << " as snapshot device at "
2525 << cow_device;
2526 }
2527
2528 created_devices.Release();
2529
2530 return true;
2531 }
2532
UnmapPartitionWithSnapshot(LockedFile * lock,const std::string & target_partition_name)2533 bool SnapshotManager::UnmapPartitionWithSnapshot(LockedFile* lock,
2534 const std::string& target_partition_name) {
2535 CHECK(lock);
2536
2537 if (!UnmapSnapshot(lock, target_partition_name)) {
2538 return false;
2539 }
2540
2541 if (!UnmapCowDevices(lock, target_partition_name)) {
2542 return false;
2543 }
2544
2545 auto base_name = GetBaseDeviceName(target_partition_name);
2546 if (!DeleteDeviceIfExists(base_name)) {
2547 LOG(ERROR) << "Cannot delete base device: " << base_name;
2548 return false;
2549 }
2550
2551 auto source_name = GetSourceDeviceName(target_partition_name);
2552 if (!DeleteDeviceIfExists(source_name)) {
2553 LOG(ERROR) << "Cannot delete source device: " << source_name;
2554 return false;
2555 }
2556
2557 LOG(INFO) << "Successfully unmapped snapshot " << target_partition_name;
2558
2559 return true;
2560 }
2561
MapCowDevices(LockedFile * lock,const CreateLogicalPartitionParams & params,const SnapshotStatus & snapshot_status,AutoDeviceList * created_devices,std::string * cow_name)2562 bool SnapshotManager::MapCowDevices(LockedFile* lock, const CreateLogicalPartitionParams& params,
2563 const SnapshotStatus& snapshot_status,
2564 AutoDeviceList* created_devices, std::string* cow_name) {
2565 CHECK(lock);
2566 CHECK(snapshot_status.cow_partition_size() + snapshot_status.cow_file_size() > 0);
2567 auto begin = std::chrono::steady_clock::now();
2568
2569 std::string partition_name = params.GetPartitionName();
2570 std::string cow_image_name = GetCowImageDeviceName(partition_name);
2571 *cow_name = GetCowName(partition_name);
2572
2573 // Map COW image if necessary.
2574 if (snapshot_status.cow_file_size() > 0) {
2575 if (!EnsureImageManager()) return false;
2576 auto remaining_time = GetRemainingTime(params.timeout_ms, begin);
2577 if (remaining_time.count() < 0) return false;
2578
2579 if (!MapCowImage(partition_name, remaining_time).has_value()) {
2580 LOG(ERROR) << "Could not map cow image for partition: " << partition_name;
2581 return false;
2582 }
2583 created_devices->EmplaceBack<AutoUnmapImage>(images_.get(), cow_image_name);
2584
2585 // If no COW partition exists, just return the image alone.
2586 if (snapshot_status.cow_partition_size() == 0) {
2587 *cow_name = std::move(cow_image_name);
2588 LOG(INFO) << "Mapped COW image for " << partition_name << " at " << *cow_name;
2589 return true;
2590 }
2591 }
2592
2593 auto remaining_time = GetRemainingTime(params.timeout_ms, begin);
2594 if (remaining_time.count() < 0) return false;
2595
2596 CHECK(snapshot_status.cow_partition_size() > 0);
2597
2598 // Create the DmTable for the COW device. It is the DmTable of the COW partition plus
2599 // COW image device as the last extent.
2600 CreateLogicalPartitionParams cow_partition_params = params;
2601 cow_partition_params.partition = nullptr;
2602 cow_partition_params.partition_name = *cow_name;
2603 cow_partition_params.device_name.clear();
2604 DmTable table;
2605 if (!CreateDmTable(cow_partition_params, &table)) {
2606 return false;
2607 }
2608 // If the COW image exists, append it as the last extent.
2609 if (snapshot_status.cow_file_size() > 0) {
2610 std::string cow_image_device;
2611 if (!GetMappedImageDeviceStringOrPath(cow_image_name, &cow_image_device)) {
2612 LOG(ERROR) << "Cannot determine major/minor for: " << cow_image_name;
2613 return false;
2614 }
2615 auto cow_partition_sectors = snapshot_status.cow_partition_size() / kSectorSize;
2616 auto cow_image_sectors = snapshot_status.cow_file_size() / kSectorSize;
2617 table.Emplace<DmTargetLinear>(cow_partition_sectors, cow_image_sectors, cow_image_device,
2618 0);
2619 }
2620
2621 // We have created the DmTable now. Map it.
2622 std::string cow_path;
2623 if (!dm_.CreateDevice(*cow_name, table, &cow_path, remaining_time)) {
2624 LOG(ERROR) << "Could not create COW device: " << *cow_name;
2625 return false;
2626 }
2627 created_devices->EmplaceBack<AutoUnmapDevice>(&dm_, *cow_name);
2628 LOG(INFO) << "Mapped COW device for " << params.GetPartitionName() << " at " << cow_path;
2629 return true;
2630 }
2631
UnmapCowDevices(LockedFile * lock,const std::string & name)2632 bool SnapshotManager::UnmapCowDevices(LockedFile* lock, const std::string& name) {
2633 CHECK(lock);
2634 if (!EnsureImageManager()) return false;
2635
2636 if (UpdateUsesCompression(lock) && !UpdateUsesUserSnapshots(lock)) {
2637 auto dm_user_name = GetDmUserCowName(name, GetSnapshotDriver(lock));
2638 if (!UnmapDmUserDevice(dm_user_name)) {
2639 return false;
2640 }
2641 }
2642
2643 if (!DeleteDeviceIfExists(GetCowName(name), 4000ms)) {
2644 LOG(ERROR) << "Cannot unmap: " << GetCowName(name);
2645 return false;
2646 }
2647
2648 std::string cow_image_name = GetCowImageDeviceName(name);
2649 if (!images_->UnmapImageIfExists(cow_image_name)) {
2650 LOG(ERROR) << "Cannot unmap image " << cow_image_name;
2651 return false;
2652 }
2653 return true;
2654 }
2655
UnmapDmUserDevice(const std::string & dm_user_name)2656 bool SnapshotManager::UnmapDmUserDevice(const std::string& dm_user_name) {
2657 if (dm_.GetState(dm_user_name) == DmDeviceState::INVALID) {
2658 return true;
2659 }
2660
2661 if (!DeleteDeviceIfExists(dm_user_name)) {
2662 LOG(ERROR) << "Cannot unmap " << dm_user_name;
2663 return false;
2664 }
2665
2666 if (EnsureSnapuserdConnected()) {
2667 if (!snapuserd_client_->WaitForDeviceDelete(dm_user_name)) {
2668 LOG(ERROR) << "Failed to wait for " << dm_user_name << " control device to delete";
2669 return false;
2670 }
2671 }
2672
2673 // Ensure the control device is gone so we don't run into ABA problems.
2674 auto control_device = "/dev/dm-user/" + dm_user_name;
2675 if (!android::fs_mgr::WaitForFileDeleted(control_device, 10s)) {
2676 LOG(ERROR) << "Timed out waiting for " << control_device << " to unlink";
2677 return false;
2678 }
2679 return true;
2680 }
2681
UnmapUserspaceSnapshotDevice(LockedFile * lock,const std::string & snapshot_name)2682 bool SnapshotManager::UnmapUserspaceSnapshotDevice(LockedFile* lock,
2683 const std::string& snapshot_name) {
2684 auto dm_user_name = GetDmUserCowName(snapshot_name, GetSnapshotDriver(lock));
2685 if (dm_.GetState(dm_user_name) == DmDeviceState::INVALID) {
2686 return true;
2687 }
2688
2689 CHECK(lock);
2690
2691 SnapshotStatus snapshot_status;
2692
2693 if (!ReadSnapshotStatus(lock, snapshot_name, &snapshot_status)) {
2694 return false;
2695 }
2696 // If the merge is complete, then we switch dm tables which is equivalent
2697 // to unmap; hence, we can't be deleting the device
2698 // as the table would be mounted off partitions and will fail.
2699 if (snapshot_status.state() != SnapshotState::MERGE_COMPLETED) {
2700 if (!DeleteDeviceIfExists(dm_user_name)) {
2701 LOG(ERROR) << "Cannot unmap " << dm_user_name;
2702 return false;
2703 }
2704 }
2705
2706 if (EnsureSnapuserdConnected()) {
2707 if (!snapuserd_client_->WaitForDeviceDelete(dm_user_name)) {
2708 LOG(ERROR) << "Failed to wait for " << dm_user_name << " control device to delete";
2709 return false;
2710 }
2711 }
2712
2713 // Ensure the control device is gone so we don't run into ABA problems.
2714 auto control_device = "/dev/dm-user/" + dm_user_name;
2715 if (!android::fs_mgr::WaitForFileDeleted(control_device, 10s)) {
2716 LOG(ERROR) << "Timed out waiting for " << control_device << " to unlink";
2717 return false;
2718 }
2719 return true;
2720 }
2721
MapAllSnapshots(const std::chrono::milliseconds & timeout_ms)2722 bool SnapshotManager::MapAllSnapshots(const std::chrono::milliseconds& timeout_ms) {
2723 auto lock = LockExclusive();
2724 if (!lock) return false;
2725
2726 auto state = ReadUpdateState(lock.get());
2727 if (state == UpdateState::Unverified) {
2728 if (GetCurrentSlot() == Slot::Target) {
2729 LOG(ERROR) << "Cannot call MapAllSnapshots when booting from the target slot.";
2730 return false;
2731 }
2732 } else if (state != UpdateState::Initiated) {
2733 LOG(ERROR) << "Cannot call MapAllSnapshots from update state: " << state;
2734 return false;
2735 }
2736
2737 std::vector<std::string> snapshots;
2738 if (!ListSnapshots(lock.get(), &snapshots)) {
2739 return false;
2740 }
2741
2742 const auto& opener = device_->GetPartitionOpener();
2743 auto slot_suffix = device_->GetOtherSlotSuffix();
2744 auto slot_number = SlotNumberForSlotSuffix(slot_suffix);
2745 auto super_device = device_->GetSuperDevice(slot_number);
2746 auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, slot_number);
2747 if (!metadata) {
2748 LOG(ERROR) << "MapAllSnapshots could not read dynamic partition metadata for device: "
2749 << super_device;
2750 return false;
2751 }
2752
2753 for (const auto& snapshot : snapshots) {
2754 if (!UnmapPartitionWithSnapshot(lock.get(), snapshot)) {
2755 LOG(ERROR) << "MapAllSnapshots could not unmap snapshot: " << snapshot;
2756 return false;
2757 }
2758
2759 CreateLogicalPartitionParams params = {
2760 .block_device = super_device,
2761 .metadata = metadata.get(),
2762 .partition_name = snapshot,
2763 .timeout_ms = timeout_ms,
2764 .partition_opener = &opener,
2765 };
2766 if (!MapPartitionWithSnapshot(lock.get(), std::move(params), SnapshotContext::Mount,
2767 nullptr)) {
2768 LOG(ERROR) << "MapAllSnapshots failed to map: " << snapshot;
2769 return false;
2770 }
2771 }
2772
2773 LOG(INFO) << "MapAllSnapshots succeeded.";
2774 return true;
2775 }
2776
UnmapAllSnapshots()2777 bool SnapshotManager::UnmapAllSnapshots() {
2778 auto lock = LockExclusive();
2779 if (!lock) return false;
2780
2781 return UnmapAllSnapshots(lock.get());
2782 }
2783
UnmapAllSnapshots(LockedFile * lock)2784 bool SnapshotManager::UnmapAllSnapshots(LockedFile* lock) {
2785 std::vector<std::string> snapshots;
2786 if (!ListSnapshots(lock, &snapshots)) {
2787 return false;
2788 }
2789
2790 for (const auto& snapshot : snapshots) {
2791 if (!UnmapPartitionWithSnapshot(lock, snapshot)) {
2792 LOG(ERROR) << "Failed to unmap snapshot: " << snapshot;
2793 return false;
2794 }
2795 }
2796
2797 // Terminate the daemon and release the snapuserd_client_ object.
2798 // If we need to re-connect with the daemon, EnsureSnapuserdConnected()
2799 // will re-create the object and establish the socket connection.
2800 if (snapuserd_client_) {
2801 LOG(INFO) << "Shutdown snapuserd daemon";
2802 snapuserd_client_->DetachSnapuserd();
2803 snapuserd_client_ = nullptr;
2804 }
2805
2806 return true;
2807 }
2808
OpenFile(const std::string & file,int lock_flags)2809 auto SnapshotManager::OpenFile(const std::string& file, int lock_flags)
2810 -> std::unique_ptr<LockedFile> {
2811 unique_fd fd(open(file.c_str(), O_RDONLY | O_CLOEXEC | O_NOFOLLOW));
2812 if (fd < 0) {
2813 PLOG(ERROR) << "Open failed: " << file;
2814 return nullptr;
2815 }
2816 if (lock_flags != 0 && TEMP_FAILURE_RETRY(flock(fd, lock_flags)) < 0) {
2817 PLOG(ERROR) << "Acquire flock failed: " << file;
2818 return nullptr;
2819 }
2820 // For simplicity, we want to CHECK that lock_mode == LOCK_EX, in some
2821 // calls, so strip extra flags.
2822 int lock_mode = lock_flags & (LOCK_EX | LOCK_SH);
2823 return std::make_unique<LockedFile>(file, std::move(fd), lock_mode);
2824 }
2825
~LockedFile()2826 SnapshotManager::LockedFile::~LockedFile() {
2827 if (TEMP_FAILURE_RETRY(flock(fd_, LOCK_UN)) < 0) {
2828 PLOG(ERROR) << "Failed to unlock file: " << path_;
2829 }
2830 }
2831
GetStateFilePath() const2832 std::string SnapshotManager::GetStateFilePath() const {
2833 return metadata_dir_ + "/state"s;
2834 }
2835
GetMergeStateFilePath() const2836 std::string SnapshotManager::GetMergeStateFilePath() const {
2837 return metadata_dir_ + "/merge_state"s;
2838 }
2839
GetLockPath() const2840 std::string SnapshotManager::GetLockPath() const {
2841 return metadata_dir_;
2842 }
2843
OpenLock(int lock_flags)2844 std::unique_ptr<SnapshotManager::LockedFile> SnapshotManager::OpenLock(int lock_flags) {
2845 auto lock_file = GetLockPath();
2846 return OpenFile(lock_file, lock_flags);
2847 }
2848
LockShared()2849 std::unique_ptr<SnapshotManager::LockedFile> SnapshotManager::LockShared() {
2850 return OpenLock(LOCK_SH);
2851 }
2852
LockExclusive()2853 std::unique_ptr<SnapshotManager::LockedFile> SnapshotManager::LockExclusive() {
2854 return OpenLock(LOCK_EX);
2855 }
2856
UpdateStateFromString(const std::string & contents)2857 static UpdateState UpdateStateFromString(const std::string& contents) {
2858 if (contents.empty() || contents == "none") {
2859 return UpdateState::None;
2860 } else if (contents == "initiated") {
2861 return UpdateState::Initiated;
2862 } else if (contents == "unverified") {
2863 return UpdateState::Unverified;
2864 } else if (contents == "merging") {
2865 return UpdateState::Merging;
2866 } else if (contents == "merge-completed") {
2867 return UpdateState::MergeCompleted;
2868 } else if (contents == "merge-needs-reboot") {
2869 return UpdateState::MergeNeedsReboot;
2870 } else if (contents == "merge-failed") {
2871 return UpdateState::MergeFailed;
2872 } else if (contents == "cancelled") {
2873 return UpdateState::Cancelled;
2874 } else {
2875 LOG(ERROR) << "Unknown merge state in update state file: \"" << contents << "\"";
2876 return UpdateState::None;
2877 }
2878 }
2879
operator <<(std::ostream & os,UpdateState state)2880 std::ostream& operator<<(std::ostream& os, UpdateState state) {
2881 switch (state) {
2882 case UpdateState::None:
2883 return os << "none";
2884 case UpdateState::Initiated:
2885 return os << "initiated";
2886 case UpdateState::Unverified:
2887 return os << "unverified";
2888 case UpdateState::Merging:
2889 return os << "merging";
2890 case UpdateState::MergeCompleted:
2891 return os << "merge-completed";
2892 case UpdateState::MergeNeedsReboot:
2893 return os << "merge-needs-reboot";
2894 case UpdateState::MergeFailed:
2895 return os << "merge-failed";
2896 case UpdateState::Cancelled:
2897 return os << "cancelled";
2898 default:
2899 LOG(ERROR) << "Unknown update state: " << static_cast<uint32_t>(state);
2900 return os;
2901 }
2902 }
2903
operator <<(std::ostream & os,MergePhase phase)2904 std::ostream& operator<<(std::ostream& os, MergePhase phase) {
2905 switch (phase) {
2906 case MergePhase::NO_MERGE:
2907 return os << "none";
2908 case MergePhase::FIRST_PHASE:
2909 return os << "first";
2910 case MergePhase::SECOND_PHASE:
2911 return os << "second";
2912 default:
2913 LOG(ERROR) << "Unknown merge phase: " << static_cast<uint32_t>(phase);
2914 return os << "unknown(" << static_cast<uint32_t>(phase) << ")";
2915 }
2916 }
2917
ReadUpdateState(LockedFile * lock)2918 UpdateState SnapshotManager::ReadUpdateState(LockedFile* lock) {
2919 SnapshotUpdateStatus status = ReadSnapshotUpdateStatus(lock);
2920 return status.state();
2921 }
2922
ReadSnapshotUpdateStatus(LockedFile * lock)2923 SnapshotUpdateStatus SnapshotManager::ReadSnapshotUpdateStatus(LockedFile* lock) {
2924 CHECK(lock);
2925
2926 SnapshotUpdateStatus status = {};
2927 std::string contents;
2928 if (!android::base::ReadFileToString(GetStateFilePath(), &contents)) {
2929 PLOG(ERROR) << "Read state file failed";
2930 status.set_state(UpdateState::None);
2931 return status;
2932 }
2933
2934 if (!status.ParseFromString(contents)) {
2935 LOG(WARNING) << "Unable to parse state file as SnapshotUpdateStatus, using the old format";
2936
2937 // Try to rollback to legacy file to support devices that are
2938 // currently using the old file format.
2939 // TODO(b/147409432)
2940 status.set_state(UpdateStateFromString(contents));
2941 }
2942
2943 return status;
2944 }
2945
WriteUpdateState(LockedFile * lock,UpdateState state,MergeFailureCode failure_code)2946 bool SnapshotManager::WriteUpdateState(LockedFile* lock, UpdateState state,
2947 MergeFailureCode failure_code) {
2948 SnapshotUpdateStatus status;
2949 status.set_state(state);
2950
2951 switch (state) {
2952 case UpdateState::MergeFailed:
2953 status.set_merge_failure_code(failure_code);
2954 break;
2955 case UpdateState::Initiated:
2956 status.set_source_build_fingerprint(
2957 android::base::GetProperty("ro.build.fingerprint", ""));
2958 break;
2959 default:
2960 break;
2961 }
2962
2963 // If we're transitioning between two valid states (eg, we're not beginning
2964 // or ending an OTA), then make sure to propagate the compression bit and
2965 // build fingerprint.
2966 if (!(state == UpdateState::Initiated || state == UpdateState::None)) {
2967 SnapshotUpdateStatus old_status = ReadSnapshotUpdateStatus(lock);
2968 status.set_using_snapuserd(old_status.using_snapuserd());
2969 status.set_source_build_fingerprint(old_status.source_build_fingerprint());
2970 status.set_merge_phase(old_status.merge_phase());
2971 status.set_userspace_snapshots(old_status.userspace_snapshots());
2972 status.set_io_uring_enabled(old_status.io_uring_enabled());
2973 }
2974 return WriteSnapshotUpdateStatus(lock, status);
2975 }
2976
WriteSnapshotUpdateStatus(LockedFile * lock,const SnapshotUpdateStatus & status)2977 bool SnapshotManager::WriteSnapshotUpdateStatus(LockedFile* lock,
2978 const SnapshotUpdateStatus& status) {
2979 CHECK(lock);
2980 CHECK(lock->lock_mode() == LOCK_EX);
2981
2982 std::string contents;
2983 if (!status.SerializeToString(&contents)) {
2984 LOG(ERROR) << "Unable to serialize SnapshotUpdateStatus.";
2985 return false;
2986 }
2987
2988 #ifdef LIBSNAPSHOT_USE_HAL
2989 auto merge_status = MergeStatus::UNKNOWN;
2990 switch (status.state()) {
2991 // The needs-reboot and completed cases imply that /data and /metadata
2992 // can be safely wiped, so we don't report a merge status.
2993 case UpdateState::None:
2994 case UpdateState::MergeNeedsReboot:
2995 case UpdateState::MergeCompleted:
2996 case UpdateState::Initiated:
2997 merge_status = MergeStatus::NONE;
2998 break;
2999 case UpdateState::Unverified:
3000 merge_status = MergeStatus::SNAPSHOTTED;
3001 break;
3002 case UpdateState::Merging:
3003 case UpdateState::MergeFailed:
3004 merge_status = MergeStatus::MERGING;
3005 break;
3006 default:
3007 // Note that Cancelled flows to here - it is never written, since
3008 // it only communicates a transient state to the caller.
3009 LOG(ERROR) << "Unexpected update status: " << status.state();
3010 break;
3011 }
3012
3013 bool set_before_write =
3014 merge_status == MergeStatus::SNAPSHOTTED || merge_status == MergeStatus::MERGING;
3015 if (set_before_write && !device_->SetBootControlMergeStatus(merge_status)) {
3016 return false;
3017 }
3018 #endif
3019
3020 if (!WriteStringToFileAtomic(contents, GetStateFilePath())) {
3021 PLOG(ERROR) << "Could not write to state file";
3022 return false;
3023 }
3024
3025 #ifdef LIBSNAPSHOT_USE_HAL
3026 if (!set_before_write && !device_->SetBootControlMergeStatus(merge_status)) {
3027 return false;
3028 }
3029 #endif
3030 return true;
3031 }
3032
GetSnapshotStatusFilePath(const std::string & name)3033 std::string SnapshotManager::GetSnapshotStatusFilePath(const std::string& name) {
3034 auto file = metadata_dir_ + "/snapshots/"s + name;
3035 return file;
3036 }
3037
ReadSnapshotStatus(LockedFile * lock,const std::string & name,SnapshotStatus * status)3038 bool SnapshotManager::ReadSnapshotStatus(LockedFile* lock, const std::string& name,
3039 SnapshotStatus* status) {
3040 CHECK(lock);
3041 auto path = GetSnapshotStatusFilePath(name);
3042
3043 unique_fd fd(open(path.c_str(), O_RDONLY | O_CLOEXEC | O_NOFOLLOW));
3044 if (fd < 0) {
3045 PLOG(ERROR) << "Open failed: " << path;
3046 return false;
3047 }
3048
3049 if (!status->ParseFromFileDescriptor(fd.get())) {
3050 PLOG(ERROR) << "Unable to parse " << path << " as SnapshotStatus";
3051 return false;
3052 }
3053
3054 if (status->name() != name) {
3055 LOG(WARNING) << "Found snapshot status named " << status->name() << " in " << path;
3056 status->set_name(name);
3057 }
3058
3059 return true;
3060 }
3061
WriteSnapshotStatus(LockedFile * lock,const SnapshotStatus & status)3062 bool SnapshotManager::WriteSnapshotStatus(LockedFile* lock, const SnapshotStatus& status) {
3063 // The caller must take an exclusive lock to modify snapshots.
3064 CHECK(lock);
3065 CHECK(lock->lock_mode() == LOCK_EX);
3066 CHECK(!status.name().empty());
3067
3068 auto path = GetSnapshotStatusFilePath(status.name());
3069
3070 std::string content;
3071 if (!status.SerializeToString(&content)) {
3072 LOG(ERROR) << "Unable to serialize SnapshotStatus for " << status.name();
3073 return false;
3074 }
3075
3076 if (!WriteStringToFileAtomic(content, path)) {
3077 PLOG(ERROR) << "Unable to write SnapshotStatus to " << path;
3078 return false;
3079 }
3080
3081 return true;
3082 }
3083
EnsureImageManager()3084 bool SnapshotManager::EnsureImageManager() {
3085 if (images_) return true;
3086
3087 images_ = device_->OpenImageManager();
3088 if (!images_) {
3089 LOG(ERROR) << "Could not open ImageManager";
3090 return false;
3091 }
3092 return true;
3093 }
3094
EnsureSnapuserdConnected()3095 bool SnapshotManager::EnsureSnapuserdConnected() {
3096 if (snapuserd_client_) {
3097 return true;
3098 }
3099
3100 if (!use_first_stage_snapuserd_ && !EnsureSnapuserdStarted()) {
3101 return false;
3102 }
3103
3104 snapuserd_client_ = SnapuserdClient::Connect(kSnapuserdSocket, 10s);
3105 if (!snapuserd_client_) {
3106 LOG(ERROR) << "Unable to connect to snapuserd";
3107 return false;
3108 }
3109 return true;
3110 }
3111
UnmapAndDeleteCowPartition(MetadataBuilder * current_metadata)3112 void SnapshotManager::UnmapAndDeleteCowPartition(MetadataBuilder* current_metadata) {
3113 std::vector<std::string> to_delete;
3114 for (auto* existing_cow_partition : current_metadata->ListPartitionsInGroup(kCowGroupName)) {
3115 if (!DeleteDeviceIfExists(existing_cow_partition->name())) {
3116 LOG(WARNING) << existing_cow_partition->name()
3117 << " cannot be unmapped and its space cannot be reclaimed";
3118 continue;
3119 }
3120 to_delete.push_back(existing_cow_partition->name());
3121 }
3122 for (const auto& name : to_delete) {
3123 current_metadata->RemovePartition(name);
3124 }
3125 }
3126
AddRequiredSpace(Return orig,const std::map<std::string,SnapshotStatus> & all_snapshot_status)3127 static Return AddRequiredSpace(Return orig,
3128 const std::map<std::string, SnapshotStatus>& all_snapshot_status) {
3129 if (orig.error_code() != Return::ErrorCode::NO_SPACE) {
3130 return orig;
3131 }
3132 uint64_t sum = 0;
3133 for (auto&& [name, status] : all_snapshot_status) {
3134 sum += status.cow_file_size();
3135 }
3136 return Return::NoSpace(sum);
3137 }
3138
CreateUpdateSnapshots(const DeltaArchiveManifest & manifest)3139 Return SnapshotManager::CreateUpdateSnapshots(const DeltaArchiveManifest& manifest) {
3140 auto lock = LockExclusive();
3141 if (!lock) return Return::Error();
3142
3143 auto update_state = ReadUpdateState(lock.get());
3144 if (update_state != UpdateState::Initiated) {
3145 LOG(ERROR) << "Cannot create update snapshots in state " << update_state;
3146 return Return::Error();
3147 }
3148
3149 // TODO(b/134949511): remove this check. Right now, with overlayfs mounted, the scratch
3150 // partition takes up a big chunk of space in super, causing COW images to be created on
3151 // retrofit Virtual A/B devices.
3152 if (device_->IsOverlayfsSetup()) {
3153 LOG(ERROR) << "Cannot create update snapshots with overlayfs setup. Run `adb enable-verity`"
3154 << ", reboot, then try again.";
3155 return Return::Error();
3156 }
3157
3158 const auto& opener = device_->GetPartitionOpener();
3159 auto current_suffix = device_->GetSlotSuffix();
3160 uint32_t current_slot = SlotNumberForSlotSuffix(current_suffix);
3161 auto target_suffix = device_->GetOtherSlotSuffix();
3162 uint32_t target_slot = SlotNumberForSlotSuffix(target_suffix);
3163 auto current_super = device_->GetSuperDevice(current_slot);
3164
3165 auto current_metadata = MetadataBuilder::New(opener, current_super, current_slot);
3166 if (current_metadata == nullptr) {
3167 LOG(ERROR) << "Cannot create metadata builder.";
3168 return Return::Error();
3169 }
3170
3171 auto target_metadata =
3172 MetadataBuilder::NewForUpdate(opener, current_super, current_slot, target_slot);
3173 if (target_metadata == nullptr) {
3174 LOG(ERROR) << "Cannot create target metadata builder.";
3175 return Return::Error();
3176 }
3177
3178 // Delete partitions with target suffix in |current_metadata|. Otherwise,
3179 // partition_cow_creator recognizes these left-over partitions as used space.
3180 for (const auto& group_name : current_metadata->ListGroups()) {
3181 if (android::base::EndsWith(group_name, target_suffix)) {
3182 current_metadata->RemoveGroupAndPartitions(group_name);
3183 }
3184 }
3185
3186 SnapshotMetadataUpdater metadata_updater(target_metadata.get(), target_slot, manifest);
3187 if (!metadata_updater.Update()) {
3188 LOG(ERROR) << "Cannot calculate new metadata.";
3189 return Return::Error();
3190 }
3191
3192 // Delete previous COW partitions in current_metadata so that PartitionCowCreator marks those as
3193 // free regions.
3194 UnmapAndDeleteCowPartition(current_metadata.get());
3195
3196 // Check that all these metadata is not retrofit dynamic partitions. Snapshots on
3197 // devices with retrofit dynamic partitions does not make sense.
3198 // This ensures that current_metadata->GetFreeRegions() uses the same device
3199 // indices as target_metadata (i.e. 0 -> "super").
3200 // This is also assumed in MapCowDevices() call below.
3201 CHECK(current_metadata->GetBlockDevicePartitionName(0) == LP_METADATA_DEFAULT_PARTITION_NAME &&
3202 target_metadata->GetBlockDevicePartitionName(0) == LP_METADATA_DEFAULT_PARTITION_NAME);
3203
3204 std::map<std::string, SnapshotStatus> all_snapshot_status;
3205
3206 // In case of error, automatically delete devices that are created along the way.
3207 // Note that "lock" is destroyed after "created_devices", so it is safe to use |lock| for
3208 // these devices.
3209 AutoDeviceList created_devices;
3210
3211 const auto& dap_metadata = manifest.dynamic_partition_metadata();
3212 CowOptions options;
3213 CowWriter writer(options);
3214 bool cow_format_support = true;
3215 if (dap_metadata.cow_version() < writer.GetCowVersion()) {
3216 cow_format_support = false;
3217 }
3218
3219 LOG(INFO) << " dap_metadata.cow_version(): " << dap_metadata.cow_version()
3220 << " writer.GetCowVersion(): " << writer.GetCowVersion();
3221
3222 // Deduce supported features.
3223 bool userspace_snapshots = CanUseUserspaceSnapshots();
3224 bool legacy_compression = GetLegacyCompressionEnabledProperty();
3225
3226 std::string vabc_disable_reason;
3227 if (!dap_metadata.vabc_enabled()) {
3228 vabc_disable_reason = "not enabled metadata";
3229 } else if (device_->IsRecovery()) {
3230 vabc_disable_reason = "recovery";
3231 } else if (!cow_format_support) {
3232 vabc_disable_reason = "cow format not supported";
3233 } else if (!KernelSupportsCompressedSnapshots()) {
3234 vabc_disable_reason = "kernel missing userspace block device support";
3235 }
3236
3237 if (!vabc_disable_reason.empty()) {
3238 if (userspace_snapshots) {
3239 LOG(INFO) << "Userspace snapshots disabled: " << vabc_disable_reason;
3240 }
3241 if (legacy_compression) {
3242 LOG(INFO) << "Compression disabled: " << vabc_disable_reason;
3243 }
3244 userspace_snapshots = false;
3245 legacy_compression = false;
3246 }
3247
3248 const bool using_snapuserd = userspace_snapshots || legacy_compression;
3249 if (!using_snapuserd) {
3250 LOG(INFO) << "Using legacy Virtual A/B (dm-snapshot)";
3251 }
3252
3253 std::string compression_algorithm;
3254 if (using_snapuserd) {
3255 compression_algorithm = dap_metadata.vabc_compression_param();
3256 if (compression_algorithm.empty()) {
3257 // Older OTAs don't set an explicit compression type, so default to gz.
3258 compression_algorithm = "gz";
3259 }
3260 }
3261
3262 PartitionCowCreator cow_creator{
3263 .target_metadata = target_metadata.get(),
3264 .target_suffix = target_suffix,
3265 .target_partition = nullptr,
3266 .current_metadata = current_metadata.get(),
3267 .current_suffix = current_suffix,
3268 .update = nullptr,
3269 .extra_extents = {},
3270 .using_snapuserd = using_snapuserd,
3271 .compression_algorithm = compression_algorithm,
3272 };
3273 if (dap_metadata.vabc_feature_set().has_threaded()) {
3274 cow_creator.enable_threading = dap_metadata.vabc_feature_set().threaded();
3275 }
3276 if (dap_metadata.vabc_feature_set().has_batch_writes()) {
3277 cow_creator.batched_writes = dap_metadata.vabc_feature_set().batch_writes();
3278 }
3279
3280 auto ret = CreateUpdateSnapshotsInternal(lock.get(), manifest, &cow_creator, &created_devices,
3281 &all_snapshot_status);
3282 if (!ret.is_ok()) return ret;
3283
3284 auto exported_target_metadata = target_metadata->Export();
3285 if (exported_target_metadata == nullptr) {
3286 LOG(ERROR) << "Cannot export target metadata";
3287 return Return::Error();
3288 }
3289
3290 ret = InitializeUpdateSnapshots(lock.get(), target_metadata.get(),
3291 exported_target_metadata.get(), target_suffix,
3292 all_snapshot_status);
3293 if (!ret.is_ok()) return ret;
3294
3295 if (!UpdatePartitionTable(opener, device_->GetSuperDevice(target_slot),
3296 *exported_target_metadata, target_slot)) {
3297 LOG(ERROR) << "Cannot write target metadata";
3298 return Return::Error();
3299 }
3300
3301 // If snapuserd is enabled, we need to retain a copy of the old metadata
3302 // so we can access original blocks in case they are moved around. We do
3303 // not want to rely on the old super metadata slot because we don't
3304 // guarantee its validity after the slot switch is successful.
3305 if (using_snapuserd) {
3306 auto metadata = current_metadata->Export();
3307 if (!metadata) {
3308 LOG(ERROR) << "Could not export current metadata";
3309 return Return::Error();
3310 }
3311
3312 auto path = GetOldPartitionMetadataPath();
3313 if (!android::fs_mgr::WriteToImageFile(path, *metadata.get())) {
3314 LOG(ERROR) << "Cannot write old metadata to " << path;
3315 return Return::Error();
3316 }
3317 }
3318
3319 SnapshotUpdateStatus status = ReadSnapshotUpdateStatus(lock.get());
3320 status.set_state(update_state);
3321 status.set_using_snapuserd(using_snapuserd);
3322
3323 if (userspace_snapshots) {
3324 status.set_userspace_snapshots(true);
3325 LOG(INFO) << "Virtual A/B using userspace snapshots";
3326
3327 if (GetIouringEnabledProperty()) {
3328 status.set_io_uring_enabled(true);
3329 LOG(INFO) << "io_uring for snapshots enabled";
3330 }
3331 } else if (legacy_compression) {
3332 LOG(INFO) << "Virtual A/B using legacy snapuserd";
3333 } else {
3334 LOG(INFO) << "Virtual A/B using dm-snapshot";
3335 }
3336
3337 is_snapshot_userspace_.emplace(userspace_snapshots);
3338
3339 if (!device()->IsTestDevice() && using_snapuserd) {
3340 // Terminate stale daemon if any
3341 std::unique_ptr<SnapuserdClient> snapuserd_client = std::move(snapuserd_client_);
3342 if (!snapuserd_client) {
3343 snapuserd_client = SnapuserdClient::Connect(kSnapuserdSocket, 5s);
3344 }
3345 if (snapuserd_client) {
3346 snapuserd_client->DetachSnapuserd();
3347 snapuserd_client = nullptr;
3348 }
3349 }
3350
3351 if (!WriteSnapshotUpdateStatus(lock.get(), status)) {
3352 LOG(ERROR) << "Unable to write new update state";
3353 return Return::Error();
3354 }
3355
3356 created_devices.Release();
3357 LOG(INFO) << "Successfully created all snapshots for target slot " << target_suffix;
3358
3359 return Return::Ok();
3360 }
3361
CreateUpdateSnapshotsInternal(LockedFile * lock,const DeltaArchiveManifest & manifest,PartitionCowCreator * cow_creator,AutoDeviceList * created_devices,std::map<std::string,SnapshotStatus> * all_snapshot_status)3362 Return SnapshotManager::CreateUpdateSnapshotsInternal(
3363 LockedFile* lock, const DeltaArchiveManifest& manifest, PartitionCowCreator* cow_creator,
3364 AutoDeviceList* created_devices,
3365 std::map<std::string, SnapshotStatus>* all_snapshot_status) {
3366 CHECK(lock);
3367
3368 auto* target_metadata = cow_creator->target_metadata;
3369 const auto& target_suffix = cow_creator->target_suffix;
3370
3371 if (!target_metadata->AddGroup(kCowGroupName, 0)) {
3372 LOG(ERROR) << "Cannot add group " << kCowGroupName;
3373 return Return::Error();
3374 }
3375
3376 std::map<std::string, const PartitionUpdate*> partition_map;
3377 std::map<std::string, std::vector<Extent>> extra_extents_map;
3378 for (const auto& partition_update : manifest.partitions()) {
3379 auto suffixed_name = partition_update.partition_name() + target_suffix;
3380 auto&& [it, inserted] = partition_map.emplace(suffixed_name, &partition_update);
3381 if (!inserted) {
3382 LOG(ERROR) << "Duplicated partition " << partition_update.partition_name()
3383 << " in update manifest.";
3384 return Return::Error();
3385 }
3386
3387 auto& extra_extents = extra_extents_map[suffixed_name];
3388 if (partition_update.has_hash_tree_extent()) {
3389 extra_extents.push_back(partition_update.hash_tree_extent());
3390 }
3391 if (partition_update.has_fec_extent()) {
3392 extra_extents.push_back(partition_update.fec_extent());
3393 }
3394 }
3395
3396 for (auto* target_partition : ListPartitionsWithSuffix(target_metadata, target_suffix)) {
3397 cow_creator->target_partition = target_partition;
3398 cow_creator->update = nullptr;
3399 auto iter = partition_map.find(target_partition->name());
3400 if (iter != partition_map.end()) {
3401 cow_creator->update = iter->second;
3402 } else {
3403 LOG(INFO) << target_partition->name()
3404 << " isn't included in the payload, skipping the cow creation.";
3405 continue;
3406 }
3407
3408 cow_creator->extra_extents.clear();
3409 auto extra_extents_it = extra_extents_map.find(target_partition->name());
3410 if (extra_extents_it != extra_extents_map.end()) {
3411 cow_creator->extra_extents = std::move(extra_extents_it->second);
3412 }
3413
3414 // Compute the device sizes for the partition.
3415 auto cow_creator_ret = cow_creator->Run();
3416 if (!cow_creator_ret.has_value()) {
3417 LOG(ERROR) << "PartitionCowCreator returned no value for " << target_partition->name();
3418 return Return::Error();
3419 }
3420
3421 LOG(INFO) << "For partition " << target_partition->name()
3422 << ", device size = " << cow_creator_ret->snapshot_status.device_size()
3423 << ", snapshot size = " << cow_creator_ret->snapshot_status.snapshot_size()
3424 << ", cow partition size = "
3425 << cow_creator_ret->snapshot_status.cow_partition_size()
3426 << ", cow file size = " << cow_creator_ret->snapshot_status.cow_file_size();
3427
3428 // Delete any existing snapshot before re-creating one.
3429 if (!DeleteSnapshot(lock, target_partition->name())) {
3430 LOG(ERROR) << "Cannot delete existing snapshot before creating a new one for partition "
3431 << target_partition->name();
3432 return Return::Error();
3433 }
3434
3435 // It is possible that the whole partition uses free space in super, and snapshot / COW
3436 // would not be needed. In this case, skip the partition.
3437 bool needs_snapshot = cow_creator_ret->snapshot_status.snapshot_size() > 0;
3438 bool needs_cow = (cow_creator_ret->snapshot_status.cow_partition_size() +
3439 cow_creator_ret->snapshot_status.cow_file_size()) > 0;
3440 CHECK(needs_snapshot == needs_cow);
3441
3442 if (!needs_snapshot) {
3443 LOG(INFO) << "Skip creating snapshot for partition " << target_partition->name()
3444 << "because nothing needs to be snapshotted.";
3445 continue;
3446 }
3447
3448 // Find the original partition size.
3449 auto name = target_partition->name();
3450 auto old_partition_name =
3451 name.substr(0, name.size() - target_suffix.size()) + cow_creator->current_suffix;
3452 auto old_partition = cow_creator->current_metadata->FindPartition(old_partition_name);
3453 if (old_partition) {
3454 cow_creator_ret->snapshot_status.set_old_partition_size(old_partition->size());
3455 }
3456
3457 // Store these device sizes to snapshot status file.
3458 if (!CreateSnapshot(lock, cow_creator, &cow_creator_ret->snapshot_status)) {
3459 return Return::Error();
3460 }
3461 created_devices->EmplaceBack<AutoDeleteSnapshot>(this, lock, target_partition->name());
3462
3463 // Create the COW partition. That is, use any remaining free space in super partition before
3464 // creating the COW images.
3465 if (cow_creator_ret->snapshot_status.cow_partition_size() > 0) {
3466 CHECK(cow_creator_ret->snapshot_status.cow_partition_size() % kSectorSize == 0)
3467 << "cow_partition_size == "
3468 << cow_creator_ret->snapshot_status.cow_partition_size()
3469 << " is not a multiple of sector size " << kSectorSize;
3470 auto cow_partition = target_metadata->AddPartition(GetCowName(target_partition->name()),
3471 kCowGroupName, 0 /* flags */);
3472 if (cow_partition == nullptr) {
3473 return Return::Error();
3474 }
3475
3476 if (!target_metadata->ResizePartition(
3477 cow_partition, cow_creator_ret->snapshot_status.cow_partition_size(),
3478 cow_creator_ret->cow_partition_usable_regions)) {
3479 LOG(ERROR) << "Cannot create COW partition on metadata with size "
3480 << cow_creator_ret->snapshot_status.cow_partition_size();
3481 return Return::Error();
3482 }
3483 // Only the in-memory target_metadata is modified; nothing to clean up if there is an
3484 // error in the future.
3485 }
3486
3487 all_snapshot_status->emplace(target_partition->name(),
3488 std::move(cow_creator_ret->snapshot_status));
3489
3490 LOG(INFO) << "Successfully created snapshot partition for " << target_partition->name();
3491 }
3492
3493 LOG(INFO) << "Allocating CoW images.";
3494
3495 for (auto&& [name, snapshot_status] : *all_snapshot_status) {
3496 // Create the backing COW image if necessary.
3497 if (snapshot_status.cow_file_size() > 0) {
3498 auto ret = CreateCowImage(lock, name);
3499 if (!ret.is_ok()) return AddRequiredSpace(ret, *all_snapshot_status);
3500 }
3501
3502 LOG(INFO) << "Successfully created snapshot for " << name;
3503 }
3504
3505 return Return::Ok();
3506 }
3507
InitializeUpdateSnapshots(LockedFile * lock,MetadataBuilder * target_metadata,const LpMetadata * exported_target_metadata,const std::string & target_suffix,const std::map<std::string,SnapshotStatus> & all_snapshot_status)3508 Return SnapshotManager::InitializeUpdateSnapshots(
3509 LockedFile* lock, MetadataBuilder* target_metadata,
3510 const LpMetadata* exported_target_metadata, const std::string& target_suffix,
3511 const std::map<std::string, SnapshotStatus>& all_snapshot_status) {
3512 CHECK(lock);
3513
3514 CreateLogicalPartitionParams cow_params{
3515 .block_device = LP_METADATA_DEFAULT_PARTITION_NAME,
3516 .metadata = exported_target_metadata,
3517 .timeout_ms = std::chrono::milliseconds::max(),
3518 .partition_opener = &device_->GetPartitionOpener(),
3519 };
3520 for (auto* target_partition : ListPartitionsWithSuffix(target_metadata, target_suffix)) {
3521 AutoDeviceList created_devices_for_cow;
3522
3523 if (!UnmapPartitionWithSnapshot(lock, target_partition->name())) {
3524 LOG(ERROR) << "Cannot unmap existing COW devices before re-mapping them for zero-fill: "
3525 << target_partition->name();
3526 return Return::Error();
3527 }
3528
3529 auto it = all_snapshot_status.find(target_partition->name());
3530 if (it == all_snapshot_status.end()) continue;
3531 cow_params.partition_name = target_partition->name();
3532 std::string cow_name;
3533 if (!MapCowDevices(lock, cow_params, it->second, &created_devices_for_cow, &cow_name)) {
3534 return Return::Error();
3535 }
3536
3537 std::string cow_path;
3538 if (!images_->GetMappedImageDevice(cow_name, &cow_path)) {
3539 LOG(ERROR) << "Cannot determine path for " << cow_name;
3540 return Return::Error();
3541 }
3542
3543 if (it->second.using_snapuserd()) {
3544 unique_fd fd(open(cow_path.c_str(), O_RDWR | O_CLOEXEC));
3545 if (fd < 0) {
3546 PLOG(ERROR) << "open " << cow_path << " failed for snapshot "
3547 << cow_params.partition_name;
3548 return Return::Error();
3549 }
3550
3551 CowOptions options;
3552 if (device()->IsTestDevice()) {
3553 options.scratch_space = false;
3554 }
3555 options.compression = it->second.compression_algorithm();
3556
3557 CowWriter writer(options);
3558 if (!writer.Initialize(fd) || !writer.Finalize()) {
3559 LOG(ERROR) << "Could not initialize COW device for " << target_partition->name();
3560 return Return::Error();
3561 }
3562 } else {
3563 auto ret = InitializeKernelCow(cow_path);
3564 if (!ret.is_ok()) {
3565 LOG(ERROR) << "Can't zero-fill COW device for " << target_partition->name() << ": "
3566 << cow_path;
3567 return AddRequiredSpace(ret, all_snapshot_status);
3568 }
3569 }
3570 // Let destructor of created_devices_for_cow to unmap the COW devices.
3571 };
3572 return Return::Ok();
3573 }
3574
MapUpdateSnapshot(const CreateLogicalPartitionParams & params,std::string * snapshot_path)3575 bool SnapshotManager::MapUpdateSnapshot(const CreateLogicalPartitionParams& params,
3576 std::string* snapshot_path) {
3577 auto lock = LockShared();
3578 if (!lock) return false;
3579 if (!UnmapPartitionWithSnapshot(lock.get(), params.GetPartitionName())) {
3580 LOG(ERROR) << "Cannot unmap existing snapshot before re-mapping it: "
3581 << params.GetPartitionName();
3582 return false;
3583 }
3584
3585 SnapshotStatus status;
3586 if (!ReadSnapshotStatus(lock.get(), params.GetPartitionName(), &status)) {
3587 return false;
3588 }
3589 if (status.using_snapuserd()) {
3590 LOG(ERROR) << "Cannot use MapUpdateSnapshot with snapuserd";
3591 return false;
3592 }
3593
3594 SnapshotPaths paths;
3595 if (!MapPartitionWithSnapshot(lock.get(), params, SnapshotContext::Update, &paths)) {
3596 return false;
3597 }
3598
3599 if (!paths.snapshot_device.empty()) {
3600 *snapshot_path = paths.snapshot_device;
3601 } else {
3602 *snapshot_path = paths.target_device;
3603 }
3604 DCHECK(!snapshot_path->empty());
3605 return true;
3606 }
3607
OpenSnapshotWriter(const android::fs_mgr::CreateLogicalPartitionParams & params,const std::optional<std::string> & source_device)3608 std::unique_ptr<ISnapshotWriter> SnapshotManager::OpenSnapshotWriter(
3609 const android::fs_mgr::CreateLogicalPartitionParams& params,
3610 const std::optional<std::string>& source_device) {
3611 #if defined(LIBSNAPSHOT_NO_COW_WRITE)
3612 (void)params;
3613 (void)source_device;
3614
3615 LOG(ERROR) << "Snapshots cannot be written in first-stage init or recovery";
3616 return nullptr;
3617 #else
3618 // First unmap any existing mapping.
3619 auto lock = LockShared();
3620 if (!lock) return nullptr;
3621 if (!UnmapPartitionWithSnapshot(lock.get(), params.GetPartitionName())) {
3622 LOG(ERROR) << "Cannot unmap existing snapshot before re-mapping it: "
3623 << params.GetPartitionName();
3624 return nullptr;
3625 }
3626
3627 SnapshotPaths paths;
3628 if (!MapPartitionWithSnapshot(lock.get(), params, SnapshotContext::Update, &paths)) {
3629 return nullptr;
3630 }
3631
3632 SnapshotStatus status;
3633 if (!paths.cow_device_name.empty()) {
3634 if (!ReadSnapshotStatus(lock.get(), params.GetPartitionName(), &status)) {
3635 return nullptr;
3636 }
3637 } else {
3638 // Currently, partition_cow_creator always creates snapshots. The
3639 // reason is that if partition X shrinks while partition Y grows, we
3640 // cannot bindly write to the newly freed extents in X. This would
3641 // make the old slot unusable. So, the entire size of the target
3642 // partition is currently considered snapshottable.
3643 LOG(ERROR) << "No snapshot available for partition " << params.GetPartitionName();
3644 return nullptr;
3645 }
3646
3647 if (status.using_snapuserd()) {
3648 return OpenCompressedSnapshotWriter(lock.get(), source_device, params.GetPartitionName(),
3649 status, paths);
3650 }
3651 return OpenKernelSnapshotWriter(lock.get(), source_device, params.GetPartitionName(), status,
3652 paths);
3653 #endif
3654 }
3655
3656 #if !defined(LIBSNAPSHOT_NO_COW_WRITE)
3657 std::unique_ptr<ISnapshotWriter> SnapshotManager::OpenCompressedSnapshotWriter(
3658 LockedFile* lock, const std::optional<std::string>& source_device,
3659 [[maybe_unused]] const std::string& partition_name, const SnapshotStatus& status,
3660 const SnapshotPaths& paths) {
3661 CHECK(lock);
3662
3663 CowOptions cow_options;
3664 cow_options.compression = status.compression_algorithm();
3665 cow_options.max_blocks = {status.device_size() / cow_options.block_size};
3666 cow_options.batch_write = status.batched_writes();
3667 cow_options.num_compress_threads = status.enable_threading() ? 2 : 0;
3668 // Disable scratch space for vts tests
3669 if (device()->IsTestDevice()) {
3670 cow_options.scratch_space = false;
3671 }
3672
3673 // Currently we don't support partial snapshots, since partition_cow_creator
3674 // never creates this scenario.
3675 CHECK(status.snapshot_size() == status.device_size());
3676
3677 auto writer = std::make_unique<CompressedSnapshotWriter>(cow_options);
3678 if (source_device) {
3679 writer->SetSourceDevice(*source_device);
3680 }
3681
3682 std::string cow_path;
3683 if (!GetMappedImageDevicePath(paths.cow_device_name, &cow_path)) {
3684 LOG(ERROR) << "Could not determine path for " << paths.cow_device_name;
3685 return nullptr;
3686 }
3687
3688 unique_fd cow_fd(open(cow_path.c_str(), O_RDWR | O_CLOEXEC));
3689 if (cow_fd < 0) {
3690 PLOG(ERROR) << "OpenCompressedSnapshotWriter: open " << cow_path;
3691 return nullptr;
3692 }
3693 if (!writer->SetCowDevice(std::move(cow_fd))) {
3694 LOG(ERROR) << "Could not create COW writer from " << cow_path;
3695 return nullptr;
3696 }
3697
3698 return writer;
3699 }
3700
3701 std::unique_ptr<ISnapshotWriter> SnapshotManager::OpenKernelSnapshotWriter(
3702 LockedFile* lock, const std::optional<std::string>& source_device,
3703 [[maybe_unused]] const std::string& partition_name, const SnapshotStatus& status,
3704 const SnapshotPaths& paths) {
3705 CHECK(lock);
3706
3707 CowOptions cow_options;
3708 cow_options.max_blocks = {status.device_size() / cow_options.block_size};
3709
3710 auto writer = std::make_unique<OnlineKernelSnapshotWriter>(cow_options);
3711
3712 std::string path = paths.snapshot_device.empty() ? paths.target_device : paths.snapshot_device;
3713 unique_fd fd(open(path.c_str(), O_RDWR | O_CLOEXEC));
3714 if (fd < 0) {
3715 PLOG(ERROR) << "open failed: " << path;
3716 return nullptr;
3717 }
3718
3719 if (source_device) {
3720 writer->SetSourceDevice(*source_device);
3721 }
3722
3723 uint64_t cow_size = status.cow_partition_size() + status.cow_file_size();
3724 writer->SetSnapshotDevice(std::move(fd), cow_size);
3725
3726 return writer;
3727 }
3728 #endif // !defined(LIBSNAPSHOT_NO_COW_WRITE)
3729
UnmapUpdateSnapshot(const std::string & target_partition_name)3730 bool SnapshotManager::UnmapUpdateSnapshot(const std::string& target_partition_name) {
3731 auto lock = LockShared();
3732 if (!lock) return false;
3733 return UnmapPartitionWithSnapshot(lock.get(), target_partition_name);
3734 }
3735
UnmapAllPartitionsInRecovery()3736 bool SnapshotManager::UnmapAllPartitionsInRecovery() {
3737 auto lock = LockExclusive();
3738 if (!lock) return false;
3739
3740 const auto& opener = device_->GetPartitionOpener();
3741 uint32_t slot = SlotNumberForSlotSuffix(device_->GetSlotSuffix());
3742 auto super_device = device_->GetSuperDevice(slot);
3743 auto metadata = android::fs_mgr::ReadMetadata(opener, super_device, slot);
3744 if (!metadata) {
3745 LOG(ERROR) << "Could not read dynamic partition metadata for device: " << super_device;
3746 return false;
3747 }
3748
3749 bool ok = true;
3750 for (const auto& partition : metadata->partitions) {
3751 auto partition_name = GetPartitionName(partition);
3752 ok &= UnmapPartitionWithSnapshot(lock.get(), partition_name);
3753 }
3754 return ok;
3755 }
3756
operator <<(std::ostream & os,SnapshotManager::Slot slot)3757 std::ostream& operator<<(std::ostream& os, SnapshotManager::Slot slot) {
3758 switch (slot) {
3759 case SnapshotManager::Slot::Unknown:
3760 return os << "unknown";
3761 case SnapshotManager::Slot::Source:
3762 return os << "source";
3763 case SnapshotManager::Slot::Target:
3764 return os << "target";
3765 }
3766 }
3767
Dump(std::ostream & os)3768 bool SnapshotManager::Dump(std::ostream& os) {
3769 // Don't actually lock. Dump() is for debugging purposes only, so it is okay
3770 // if it is racy.
3771 auto file = OpenLock(0 /* lock flag */);
3772 if (!file) return false;
3773
3774 std::stringstream ss;
3775
3776 auto update_status = ReadSnapshotUpdateStatus(file.get());
3777
3778 ss << "Update state: " << update_status.state() << std::endl;
3779 ss << "Using snapuserd: " << update_status.using_snapuserd() << std::endl;
3780 ss << "Using userspace snapshots: " << update_status.userspace_snapshots() << std::endl;
3781 ss << "Using io_uring: " << update_status.io_uring_enabled() << std::endl;
3782 ss << "Using XOR compression: " << GetXorCompressionEnabledProperty() << std::endl;
3783 ss << "Current slot: " << device_->GetSlotSuffix() << std::endl;
3784 ss << "Boot indicator: booting from " << GetCurrentSlot() << " slot" << std::endl;
3785 ss << "Rollback indicator: "
3786 << (access(GetRollbackIndicatorPath().c_str(), F_OK) == 0 ? "exists" : strerror(errno))
3787 << std::endl;
3788 ss << "Forward merge indicator: "
3789 << (access(GetForwardMergeIndicatorPath().c_str(), F_OK) == 0 ? "exists" : strerror(errno))
3790 << std::endl;
3791 ss << "Source build fingerprint: " << update_status.source_build_fingerprint() << std::endl;
3792
3793 if (update_status.state() == UpdateState::Merging) {
3794 ss << "Merge completion: ";
3795 if (!EnsureSnapuserdConnected()) {
3796 ss << "N/A";
3797 } else {
3798 ss << snapuserd_client_->GetMergePercent() << "%";
3799 }
3800 ss << std::endl;
3801 ss << "Merge phase: " << update_status.merge_phase() << std::endl;
3802 }
3803
3804 bool ok = true;
3805 std::vector<std::string> snapshots;
3806 if (!ListSnapshots(file.get(), &snapshots)) {
3807 LOG(ERROR) << "Could not list snapshots";
3808 snapshots.clear();
3809 ok = false;
3810 }
3811 for (const auto& name : snapshots) {
3812 ss << "Snapshot: " << name << std::endl;
3813 SnapshotStatus status;
3814 if (!ReadSnapshotStatus(file.get(), name, &status)) {
3815 ok = false;
3816 continue;
3817 }
3818 ss << " state: " << SnapshotState_Name(status.state()) << std::endl;
3819 ss << " device size (bytes): " << status.device_size() << std::endl;
3820 ss << " snapshot size (bytes): " << status.snapshot_size() << std::endl;
3821 ss << " cow partition size (bytes): " << status.cow_partition_size() << std::endl;
3822 ss << " cow file size (bytes): " << status.cow_file_size() << std::endl;
3823 ss << " allocated sectors: " << status.sectors_allocated() << std::endl;
3824 ss << " metadata sectors: " << status.metadata_sectors() << std::endl;
3825 ss << " compression: " << status.compression_algorithm() << std::endl;
3826 ss << " merge phase: " << DecideMergePhase(status) << std::endl;
3827 }
3828 os << ss.rdbuf();
3829 return ok;
3830 }
3831
EnsureMetadataMounted()3832 std::unique_ptr<AutoDevice> SnapshotManager::EnsureMetadataMounted() {
3833 if (!device_->IsRecovery()) {
3834 // No need to mount anything in recovery.
3835 LOG(INFO) << "EnsureMetadataMounted does nothing in Android mode.";
3836 return std::unique_ptr<AutoUnmountDevice>(new AutoUnmountDevice());
3837 }
3838 auto ret = AutoUnmountDevice::New(device_->GetMetadataDir());
3839 if (ret == nullptr) return nullptr;
3840
3841 // In rescue mode, it is possible to erase and format metadata, but /metadata/ota is not
3842 // created to execute snapshot updates. Hence, subsequent calls is likely to fail because
3843 // Lock*() fails. By failing early and returning nullptr here, update_engine_sideload can
3844 // treat this case as if /metadata is not mounted.
3845 if (!LockShared()) {
3846 LOG(WARNING) << "/metadata is mounted, but errors occur when acquiring a shared lock. "
3847 "Subsequent calls to SnapshotManager will fail. Unmounting /metadata now.";
3848 return nullptr;
3849 }
3850 return ret;
3851 }
3852
HandleImminentDataWipe(const std::function<void ()> & callback)3853 bool SnapshotManager::HandleImminentDataWipe(const std::function<void()>& callback) {
3854 if (!device_->IsRecovery()) {
3855 LOG(ERROR) << "Data wipes are only allowed in recovery.";
3856 return false;
3857 }
3858
3859 auto mount = EnsureMetadataMounted();
3860 if (!mount || !mount->HasDevice()) {
3861 // We allow the wipe to continue, because if we can't mount /metadata,
3862 // it is unlikely the device would have booted anyway. If there is no
3863 // metadata partition, then the device predates Virtual A/B.
3864 return true;
3865 }
3866
3867 // Check this early, so we don't accidentally start trying to populate
3868 // the state file in recovery. Note we don't call GetUpdateState since
3869 // we want errors in acquiring the lock to be propagated, instead of
3870 // returning UpdateState::None.
3871 auto state_file = GetStateFilePath();
3872 if (access(state_file.c_str(), F_OK) != 0 && errno == ENOENT) {
3873 return true;
3874 }
3875
3876 auto slot_number = SlotNumberForSlotSuffix(device_->GetSlotSuffix());
3877 auto super_path = device_->GetSuperDevice(slot_number);
3878 if (!CreateLogicalAndSnapshotPartitions(super_path, 20s)) {
3879 LOG(ERROR) << "Unable to map partitions to complete merge.";
3880 return false;
3881 }
3882
3883 auto process_callback = [&]() -> bool {
3884 if (callback) {
3885 callback();
3886 }
3887 return true;
3888 };
3889
3890 in_factory_data_reset_ = true;
3891 UpdateState state =
3892 ProcessUpdateStateOnDataWipe(true /* allow_forward_merge */, process_callback);
3893 in_factory_data_reset_ = false;
3894
3895 if (state == UpdateState::MergeFailed) {
3896 return false;
3897 }
3898
3899 // Nothing should be depending on partitions now, so unmap them all.
3900 if (!UnmapAllPartitionsInRecovery()) {
3901 LOG(ERROR) << "Unable to unmap all partitions; fastboot may fail to flash.";
3902 }
3903
3904 if (state != UpdateState::None) {
3905 auto lock = LockExclusive();
3906 if (!lock) return false;
3907
3908 // Zap the update state so the bootloader doesn't think we're still
3909 // merging. It's okay if this fails, it's informative only at this
3910 // point.
3911 WriteUpdateState(lock.get(), UpdateState::None);
3912 }
3913 return true;
3914 }
3915
FinishMergeInRecovery()3916 bool SnapshotManager::FinishMergeInRecovery() {
3917 if (!device_->IsRecovery()) {
3918 LOG(ERROR) << "Data wipes are only allowed in recovery.";
3919 return false;
3920 }
3921
3922 auto mount = EnsureMetadataMounted();
3923 if (!mount || !mount->HasDevice()) {
3924 return false;
3925 }
3926
3927 auto slot_number = SlotNumberForSlotSuffix(device_->GetSlotSuffix());
3928 auto super_path = device_->GetSuperDevice(slot_number);
3929 if (!CreateLogicalAndSnapshotPartitions(super_path, 20s)) {
3930 LOG(ERROR) << "Unable to map partitions to complete merge.";
3931 return false;
3932 }
3933
3934 UpdateState state = ProcessUpdateState();
3935 if (state != UpdateState::MergeCompleted) {
3936 LOG(ERROR) << "Merge returned unexpected status: " << state;
3937 return false;
3938 }
3939
3940 // Nothing should be depending on partitions now, so unmap them all.
3941 if (!UnmapAllPartitionsInRecovery()) {
3942 LOG(ERROR) << "Unable to unmap all partitions; fastboot may fail to flash.";
3943 }
3944 return true;
3945 }
3946
ProcessUpdateStateOnDataWipe(bool allow_forward_merge,const std::function<bool ()> & callback)3947 UpdateState SnapshotManager::ProcessUpdateStateOnDataWipe(bool allow_forward_merge,
3948 const std::function<bool()>& callback) {
3949 auto slot_number = SlotNumberForSlotSuffix(device_->GetSlotSuffix());
3950 UpdateState state = ProcessUpdateState(callback);
3951 LOG(INFO) << "Update state in recovery: " << state;
3952 switch (state) {
3953 case UpdateState::MergeFailed:
3954 LOG(ERROR) << "Unrecoverable merge failure detected.";
3955 return state;
3956 case UpdateState::Unverified: {
3957 // If an OTA was just applied but has not yet started merging:
3958 //
3959 // - if forward merge is allowed, initiate merge and call
3960 // ProcessUpdateState again.
3961 //
3962 // - if forward merge is not allowed, we
3963 // have no choice but to revert slots, because the current slot will
3964 // immediately become unbootable. Rather than wait for the device
3965 // to reboot N times until a rollback, we proactively disable the
3966 // new slot instead.
3967 //
3968 // Since the rollback is inevitable, we don't treat a HAL failure
3969 // as an error here.
3970 auto slot = GetCurrentSlot();
3971 if (slot == Slot::Target) {
3972 if (allow_forward_merge &&
3973 access(GetForwardMergeIndicatorPath().c_str(), F_OK) == 0) {
3974 LOG(INFO) << "Forward merge allowed, initiating merge now.";
3975
3976 if (!InitiateMerge()) {
3977 LOG(ERROR) << "Failed to initiate merge on data wipe.";
3978 return UpdateState::MergeFailed;
3979 }
3980 return ProcessUpdateStateOnDataWipe(false /* allow_forward_merge */, callback);
3981 }
3982
3983 LOG(ERROR) << "Reverting to old slot since update will be deleted.";
3984 device_->SetSlotAsUnbootable(slot_number);
3985 } else {
3986 LOG(INFO) << "Booting from " << slot << " slot, no action is taken.";
3987 }
3988 break;
3989 }
3990 case UpdateState::MergeNeedsReboot:
3991 // We shouldn't get here, because nothing is depending on
3992 // logical partitions.
3993 LOG(ERROR) << "Unexpected merge-needs-reboot state in recovery.";
3994 break;
3995 default:
3996 break;
3997 }
3998 return state;
3999 }
4000
EnsureNoOverflowSnapshot(LockedFile * lock)4001 bool SnapshotManager::EnsureNoOverflowSnapshot(LockedFile* lock) {
4002 CHECK(lock);
4003
4004 std::vector<std::string> snapshots;
4005 if (!ListSnapshots(lock, &snapshots)) {
4006 LOG(ERROR) << "Could not list snapshots.";
4007 return false;
4008 }
4009
4010 for (const auto& snapshot : snapshots) {
4011 SnapshotStatus status;
4012 if (!ReadSnapshotStatus(lock, snapshot, &status)) {
4013 return false;
4014 }
4015 if (status.using_snapuserd()) {
4016 continue;
4017 }
4018
4019 std::vector<DeviceMapper::TargetInfo> targets;
4020 if (!dm_.GetTableStatus(snapshot, &targets)) {
4021 LOG(ERROR) << "Could not read snapshot device table: " << snapshot;
4022 return false;
4023 }
4024 if (targets.size() != 1) {
4025 LOG(ERROR) << "Unexpected device-mapper table for snapshot: " << snapshot
4026 << ", size = " << targets.size();
4027 return false;
4028 }
4029 if (targets[0].IsOverflowSnapshot()) {
4030 LOG(ERROR) << "Detected overflow in snapshot " << snapshot
4031 << ", CoW device size computation is wrong!";
4032 return false;
4033 }
4034 }
4035
4036 return true;
4037 }
4038
RecoveryCreateSnapshotDevices()4039 CreateResult SnapshotManager::RecoveryCreateSnapshotDevices() {
4040 if (!device_->IsRecovery()) {
4041 LOG(ERROR) << __func__ << " is only allowed in recovery.";
4042 return CreateResult::NOT_CREATED;
4043 }
4044
4045 auto mount = EnsureMetadataMounted();
4046 if (!mount || !mount->HasDevice()) {
4047 LOG(ERROR) << "Couldn't mount Metadata.";
4048 return CreateResult::NOT_CREATED;
4049 }
4050 return RecoveryCreateSnapshotDevices(mount);
4051 }
4052
RecoveryCreateSnapshotDevices(const std::unique_ptr<AutoDevice> & metadata_device)4053 CreateResult SnapshotManager::RecoveryCreateSnapshotDevices(
4054 const std::unique_ptr<AutoDevice>& metadata_device) {
4055 if (!device_->IsRecovery()) {
4056 LOG(ERROR) << __func__ << " is only allowed in recovery.";
4057 return CreateResult::NOT_CREATED;
4058 }
4059
4060 if (metadata_device == nullptr || !metadata_device->HasDevice()) {
4061 LOG(ERROR) << "Metadata not mounted.";
4062 return CreateResult::NOT_CREATED;
4063 }
4064
4065 auto state_file = GetStateFilePath();
4066 if (access(state_file.c_str(), F_OK) != 0 && errno == ENOENT) {
4067 LOG(ERROR) << "Couldn't access state file.";
4068 return CreateResult::NOT_CREATED;
4069 }
4070
4071 if (!NeedSnapshotsInFirstStageMount()) {
4072 return CreateResult::NOT_CREATED;
4073 }
4074
4075 auto slot_suffix = device_->GetOtherSlotSuffix();
4076 auto slot_number = SlotNumberForSlotSuffix(slot_suffix);
4077 auto super_path = device_->GetSuperDevice(slot_number);
4078 if (!CreateLogicalAndSnapshotPartitions(super_path, 20s)) {
4079 LOG(ERROR) << "Unable to map partitions.";
4080 return CreateResult::ERROR;
4081 }
4082 return CreateResult::CREATED;
4083 }
4084
UpdateForwardMergeIndicator(bool wipe)4085 bool SnapshotManager::UpdateForwardMergeIndicator(bool wipe) {
4086 auto path = GetForwardMergeIndicatorPath();
4087
4088 if (!wipe) {
4089 LOG(INFO) << "Wipe is not scheduled. Deleting forward merge indicator.";
4090 return RemoveFileIfExists(path);
4091 }
4092
4093 // TODO(b/152094219): Don't forward merge if no CoW file is allocated.
4094
4095 LOG(INFO) << "Wipe will be scheduled. Allowing forward merge of snapshots.";
4096 if (!android::base::WriteStringToFile("1", path)) {
4097 PLOG(ERROR) << "Unable to write forward merge indicator: " << path;
4098 return false;
4099 }
4100
4101 return true;
4102 }
4103
GetSnapshotMergeStatsInstance()4104 ISnapshotMergeStats* SnapshotManager::GetSnapshotMergeStatsInstance() {
4105 return SnapshotMergeStats::GetInstance(*this);
4106 }
4107
4108 // This is only to be used in recovery or normal Android (not first-stage init).
4109 // We don't guarantee dm paths are available in first-stage init, because ueventd
4110 // isn't running yet.
GetMappedImageDevicePath(const std::string & device_name,std::string * device_path)4111 bool SnapshotManager::GetMappedImageDevicePath(const std::string& device_name,
4112 std::string* device_path) {
4113 // Try getting the device string if it is a device mapper device.
4114 if (dm_.GetState(device_name) != DmDeviceState::INVALID) {
4115 return dm_.GetDmDevicePathByName(device_name, device_path);
4116 }
4117
4118 // Otherwise, get path from IImageManager.
4119 return images_->GetMappedImageDevice(device_name, device_path);
4120 }
4121
GetMappedImageDeviceStringOrPath(const std::string & device_name,std::string * device_string_or_mapped_path)4122 bool SnapshotManager::GetMappedImageDeviceStringOrPath(const std::string& device_name,
4123 std::string* device_string_or_mapped_path) {
4124 // Try getting the device string if it is a device mapper device.
4125 if (dm_.GetState(device_name) != DmDeviceState::INVALID) {
4126 return dm_.GetDeviceString(device_name, device_string_or_mapped_path);
4127 }
4128
4129 // Otherwise, get path from IImageManager.
4130 if (!images_->GetMappedImageDevice(device_name, device_string_or_mapped_path)) {
4131 return false;
4132 }
4133
4134 LOG(WARNING) << "Calling GetMappedImageDevice with local image manager; device "
4135 << (device_string_or_mapped_path ? *device_string_or_mapped_path : "(nullptr)")
4136 << "may not be available in first stage init! ";
4137 return true;
4138 }
4139
WaitForDevice(const std::string & device,std::chrono::milliseconds timeout_ms)4140 bool SnapshotManager::WaitForDevice(const std::string& device,
4141 std::chrono::milliseconds timeout_ms) {
4142 if (!android::base::StartsWith(device, "/")) {
4143 return true;
4144 }
4145
4146 // In first-stage init, we rely on init setting a callback which can
4147 // regenerate uevents and populate /dev for us.
4148 if (uevent_regen_callback_) {
4149 if (!uevent_regen_callback_(device)) {
4150 LOG(ERROR) << "Failed to find device after regenerating uevents: " << device;
4151 return false;
4152 }
4153 return true;
4154 }
4155
4156 // Otherwise, the only kind of device we need to wait for is a dm-user
4157 // misc device. Normal calls to DeviceMapper::CreateDevice() guarantee
4158 // the path has been created.
4159 if (!android::base::StartsWith(device, "/dev/dm-user/")) {
4160 return true;
4161 }
4162
4163 if (timeout_ms.count() == 0) {
4164 LOG(ERROR) << "No timeout was specified to wait for device: " << device;
4165 return false;
4166 }
4167 if (!android::fs_mgr::WaitForFile(device, timeout_ms)) {
4168 LOG(ERROR) << "Timed out waiting for device to appear: " << device;
4169 return false;
4170 }
4171 return true;
4172 }
4173
IsSnapuserdRequired()4174 bool SnapshotManager::IsSnapuserdRequired() {
4175 auto lock = LockExclusive();
4176 if (!lock) return false;
4177
4178 auto status = ReadSnapshotUpdateStatus(lock.get());
4179 return status.state() != UpdateState::None && status.using_snapuserd();
4180 }
4181
PrepareSnapuserdArgsForSelinux(std::vector<std::string> * snapuserd_argv)4182 bool SnapshotManager::PrepareSnapuserdArgsForSelinux(std::vector<std::string>* snapuserd_argv) {
4183 return PerformInitTransition(InitTransition::SELINUX_DETACH, snapuserd_argv);
4184 }
4185
DetachFirstStageSnapuserdForSelinux()4186 bool SnapshotManager::DetachFirstStageSnapuserdForSelinux() {
4187 LOG(INFO) << "Detaching first stage snapuserd";
4188
4189 auto lock = LockExclusive();
4190 if (!lock) return false;
4191
4192 std::vector<std::string> snapshots;
4193 if (!ListSnapshots(lock.get(), &snapshots)) {
4194 LOG(ERROR) << "Failed to list snapshots.";
4195 return false;
4196 }
4197
4198 size_t num_cows = 0;
4199 size_t ok_cows = 0;
4200 for (const auto& snapshot : snapshots) {
4201 std::string user_cow_name = GetDmUserCowName(snapshot, GetSnapshotDriver(lock.get()));
4202
4203 if (dm_.GetState(user_cow_name) == DmDeviceState::INVALID) {
4204 continue;
4205 }
4206
4207 DeviceMapper::TargetInfo target;
4208 if (!GetSingleTarget(user_cow_name, TableQuery::Table, &target)) {
4209 continue;
4210 }
4211
4212 auto target_type = DeviceMapper::GetTargetType(target.spec);
4213 if (target_type != "user") {
4214 LOG(ERROR) << "Unexpected target type for " << user_cow_name << ": " << target_type;
4215 continue;
4216 }
4217
4218 num_cows++;
4219 auto misc_name = user_cow_name;
4220
4221 DmTable table;
4222 table.Emplace<DmTargetUser>(0, target.spec.length, misc_name);
4223 if (!dm_.LoadTableAndActivate(user_cow_name, table)) {
4224 LOG(ERROR) << "Unable to swap tables for " << misc_name;
4225 continue;
4226 }
4227
4228 // Wait for ueventd to acknowledge and create the control device node.
4229 std::string control_device = "/dev/dm-user/" + misc_name;
4230 if (!WaitForDevice(control_device, 10s)) {
4231 LOG(ERROR) << "dm-user control device no found: " << misc_name;
4232 continue;
4233 }
4234
4235 ok_cows++;
4236 LOG(INFO) << "control device is ready: " << control_device;
4237 }
4238
4239 if (ok_cows != num_cows) {
4240 LOG(ERROR) << "Could not transition all snapuserd consumers.";
4241 return false;
4242 }
4243
4244 return true;
4245 }
4246
PerformSecondStageInitTransition()4247 bool SnapshotManager::PerformSecondStageInitTransition() {
4248 return PerformInitTransition(InitTransition::SECOND_STAGE);
4249 }
4250
ReadOldPartitionMetadata(LockedFile * lock)4251 const LpMetadata* SnapshotManager::ReadOldPartitionMetadata(LockedFile* lock) {
4252 CHECK(lock);
4253
4254 if (!old_partition_metadata_) {
4255 auto path = GetOldPartitionMetadataPath();
4256 old_partition_metadata_ = android::fs_mgr::ReadFromImageFile(path);
4257 if (!old_partition_metadata_) {
4258 LOG(ERROR) << "Could not read old partition metadata from " << path;
4259 return nullptr;
4260 }
4261 }
4262 return old_partition_metadata_.get();
4263 }
4264
DecideMergePhase(const SnapshotStatus & status)4265 MergePhase SnapshotManager::DecideMergePhase(const SnapshotStatus& status) {
4266 if (status.using_snapuserd() && status.device_size() < status.old_partition_size()) {
4267 return MergePhase::FIRST_PHASE;
4268 }
4269 return MergePhase::SECOND_PHASE;
4270 }
4271
UpdateCowStats(ISnapshotMergeStats * stats)4272 void SnapshotManager::UpdateCowStats(ISnapshotMergeStats* stats) {
4273 auto lock = LockExclusive();
4274 if (!lock) return;
4275
4276 std::vector<std::string> snapshots;
4277 if (!ListSnapshots(lock.get(), &snapshots, GetSnapshotSlotSuffix())) {
4278 LOG(ERROR) << "Could not list snapshots";
4279 return;
4280 }
4281
4282 uint64_t cow_file_size = 0;
4283 uint64_t total_cow_size = 0;
4284 uint64_t estimated_cow_size = 0;
4285 for (const auto& snapshot : snapshots) {
4286 SnapshotStatus status;
4287 if (!ReadSnapshotStatus(lock.get(), snapshot, &status)) {
4288 return;
4289 }
4290
4291 cow_file_size += status.cow_file_size();
4292 total_cow_size += status.cow_file_size() + status.cow_partition_size();
4293 estimated_cow_size += status.estimated_cow_size();
4294 }
4295
4296 stats->report()->set_cow_file_size(cow_file_size);
4297 stats->report()->set_total_cow_size_bytes(total_cow_size);
4298 stats->report()->set_estimated_cow_size_bytes(estimated_cow_size);
4299 }
4300
SetMergeStatsFeatures(ISnapshotMergeStats * stats)4301 void SnapshotManager::SetMergeStatsFeatures(ISnapshotMergeStats* stats) {
4302 auto lock = LockExclusive();
4303 if (!lock) return;
4304
4305 SnapshotUpdateStatus update_status = ReadSnapshotUpdateStatus(lock.get());
4306 stats->report()->set_iouring_used(update_status.io_uring_enabled());
4307 stats->report()->set_userspace_snapshots_used(update_status.userspace_snapshots());
4308 stats->report()->set_xor_compression_used(GetXorCompressionEnabledProperty());
4309 }
4310
DeleteDeviceIfExists(const std::string & name,const std::chrono::milliseconds & timeout_ms)4311 bool SnapshotManager::DeleteDeviceIfExists(const std::string& name,
4312 const std::chrono::milliseconds& timeout_ms) {
4313 auto start = std::chrono::steady_clock::now();
4314 while (true) {
4315 if (dm_.DeleteDeviceIfExists(name)) {
4316 return true;
4317 }
4318 auto now = std::chrono::steady_clock::now();
4319 auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(now - start);
4320 if (elapsed >= timeout_ms) {
4321 break;
4322 }
4323 std::this_thread::sleep_for(400ms);
4324 }
4325
4326 // Try to diagnose why this failed. First get the actual device path.
4327 std::string full_path;
4328 if (!dm_.GetDmDevicePathByName(name, &full_path)) {
4329 LOG(ERROR) << "Unable to diagnose DM_DEV_REMOVE failure.";
4330 return false;
4331 }
4332
4333 // Check for child dm-devices.
4334 std::string block_name = android::base::Basename(full_path);
4335 std::string sysfs_holders = "/sys/class/block/" + block_name + "/holders";
4336
4337 std::error_code ec;
4338 std::filesystem::directory_iterator dir_iter(sysfs_holders, ec);
4339 if (auto begin = std::filesystem::begin(dir_iter); begin != std::filesystem::end(dir_iter)) {
4340 LOG(ERROR) << "Child device-mapper device still mapped: " << begin->path();
4341 return false;
4342 }
4343
4344 // Check for mounted partitions.
4345 android::fs_mgr::Fstab fstab;
4346 android::fs_mgr::ReadFstabFromFile("/proc/mounts", &fstab);
4347 for (const auto& entry : fstab) {
4348 if (android::base::Basename(entry.blk_device) == block_name) {
4349 LOG(ERROR) << "Partition still mounted: " << entry.mount_point;
4350 return false;
4351 }
4352 }
4353
4354 // Check for detached mounted partitions.
4355 for (const auto& fs : std::filesystem::directory_iterator("/sys/fs", ec)) {
4356 std::string fs_type = android::base::Basename(fs.path().c_str());
4357 if (!(fs_type == "ext4" || fs_type == "f2fs")) {
4358 continue;
4359 }
4360
4361 std::string path = fs.path().c_str() + "/"s + block_name;
4362 if (access(path.c_str(), F_OK) == 0) {
4363 LOG(ERROR) << "Block device was lazily unmounted and is still in-use: " << full_path
4364 << "; possibly open file descriptor or attached loop device.";
4365 return false;
4366 }
4367 }
4368
4369 LOG(ERROR) << "Device-mapper device " << name << "(" << full_path << ")"
4370 << " still in use."
4371 << " Probably a file descriptor was leaked or held open, or a loop device is"
4372 << " attached.";
4373 return false;
4374 }
4375
ReadMergeFailureCode()4376 MergeFailureCode SnapshotManager::ReadMergeFailureCode() {
4377 auto lock = LockExclusive();
4378 if (!lock) return MergeFailureCode::AcquireLock;
4379
4380 SnapshotUpdateStatus status = ReadSnapshotUpdateStatus(lock.get());
4381 if (status.state() != UpdateState::MergeFailed) {
4382 return MergeFailureCode::Ok;
4383 }
4384 return status.merge_failure_code();
4385 }
4386
ReadSourceBuildFingerprint()4387 std::string SnapshotManager::ReadSourceBuildFingerprint() {
4388 auto lock = LockExclusive();
4389 if (!lock) return {};
4390
4391 SnapshotUpdateStatus status = ReadSnapshotUpdateStatus(lock.get());
4392 return status.source_build_fingerprint();
4393 }
4394
IsUserspaceSnapshotUpdateInProgress()4395 bool SnapshotManager::IsUserspaceSnapshotUpdateInProgress() {
4396 auto slot = GetCurrentSlot();
4397 if (slot == Slot::Target) {
4398 if (IsSnapuserdRequired()) {
4399 return true;
4400 }
4401 }
4402
4403 return false;
4404 }
4405
4406 } // namespace snapshot
4407 } // namespace android
4408