/*
 * Copyright (C) 2020 The Android Open Source Project
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "snapuserd.h"

#include <dirent.h>
#include <fcntl.h>
#include <linux/fs.h>
#include <unistd.h>
#include <algorithm>

#include <csignal>
#include <optional>
#include <set>

#include <android-base/file.h>
#include <android-base/logging.h>
#include <android-base/parseint.h>
#include <android-base/properties.h>
#include <android-base/strings.h>
#include <android-base/unique_fd.h>
#include <snapuserd/snapuserd_client.h>

namespace android {
namespace snapshot {

using namespace android;
using namespace android::dm;
using android::base::unique_fd;

#define SNAP_LOG(level) LOG(level) << misc_name_ << ": "
#define SNAP_PLOG(level) PLOG(level) << misc_name_ << ": "

Snapuserd::Snapuserd(const std::string& misc_name, const std::string& cow_device,
                     const std::string& backing_device) {
    misc_name_ = misc_name;
    cow_device_ = cow_device;
    backing_store_device_ = backing_device;
    control_device_ = "/dev/dm-user/" + misc_name;
}

bool Snapuserd::InitializeWorkers() {
    for (int i = 0; i < NUM_THREADS_PER_PARTITION; i++) {
        std::unique_ptr<WorkerThread> wt = std::make_unique<WorkerThread>(
                cow_device_, backing_store_device_, control_device_, misc_name_, GetSharedPtr());

        worker_threads_.push_back(std::move(wt));
    }

    read_ahead_thread_ = std::make_unique<ReadAheadThread>(cow_device_, backing_store_device_,
                                                           misc_name_, GetSharedPtr());
    return true;
}

std::unique_ptr<CowReader> Snapuserd::CloneReaderForWorker() {
    return reader_->CloneCowReader();
}

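// Persist merge progress: bump num_merge_ops in the mmap'd COW header and, if
// read-ahead is enabled, mark the scratch buffer as in-progress before syncing
// the first block of the COW device. This is the on-disk checkpoint that allows
// merge progress to survive a crash or reboot.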
bool Snapuserd::CommitMerge(int num_merge_ops) {
    struct CowHeader* ch = reinterpret_cast<struct CowHeader*>(mapped_addr_);
    ch->num_merge_ops += num_merge_ops;

    if (read_ahead_feature_ && read_ahead_ops_.size() > 0) {
        struct BufferState* ra_state = GetBufferState();
        ra_state->read_ahead_state = kCowReadAheadInProgress;
    }

    int ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
    if (ret < 0) {
        SNAP_PLOG(ERROR) << "msync header failed: " << ret;
        return false;
    }

    merge_initiated_ = true;

    return true;
}

void Snapuserd::PrepareReadAhead() {
    if (!read_ahead_feature_) {
        return;
    }

    struct BufferState* ra_state = GetBufferState();
    // Check if the data has to be re-constructed from the COW device
    if (ra_state->read_ahead_state == kCowReadAheadDone) {
        populate_data_from_cow_ = true;
    } else {
        populate_data_from_cow_ = false;
    }

    StartReadAhead();
}

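// Serve a single block out of the in-memory read-ahead cache. The caller must
// already hold lock_; the unique_lock is passed in only so that ownership can
// be verified here.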
bool Snapuserd::GetRABuffer(std::unique_lock<std::mutex>* lock, uint64_t block, void* buffer) {
    if (!lock->owns_lock()) {
        SNAP_LOG(ERROR) << "GetRABuffer - Lock not held";
        return false;
    }
    std::unordered_map<uint64_t, void*>::iterator it = read_ahead_buffer_map_.find(block);

    // This will be true only for IOs generated as part of reading a root
    // filesystem. IOs related to merge should always be in the read-ahead cache.
    if (it == read_ahead_buffer_map_.end()) {
        return false;
    }

    // Theoretically, we could send the data back from the read-ahead buffer
    // all the way to the kernel without a memcpy. However, if the IO is
    // un-aligned, the wrapper function would need to touch the read-ahead
    // buffers and the transitions would be a bit more complicated.
    memcpy(buffer, it->second, BLOCK_SZ);
    return true;
}

// ========== State transition functions for read-ahead operations ===========
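//
// A rough sketch of the io_state_ transitions driven by the functions below
// (as implemented in this file):
//
//   READ_AHEAD_BEGIN        - set by StartReadAhead(); wakes the read-ahead
//                             thread so it can prefetch the next batch.
//   READ_AHEAD_IN_PROGRESS  - set by the read-ahead thread once it resumes
//                             in WaitForMergeToComplete().
//   IO_IN_PROGRESS          - set by ReadAheadIOCompleted(); worker threads
//                             may now serve blocks from the cache.
//   READ_AHEAD_FAILURE      - set by ReadAheadIOFailed(); readers bail out.
//   IO_TERMINATED           - set by MergeCompleted(); tells the read-ahead
//                             thread that all worker threads are done.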

bool Snapuserd::GetReadAheadPopulatedBuffer(uint64_t block, void* buffer) {
    if (!read_ahead_feature_) {
        return false;
    }

    {
        std::unique_lock<std::mutex> lock(lock_);
        if (io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE) {
            return false;
        }

        if (io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS) {
            return GetRABuffer(&lock, block, buffer);
        }
    }

    {
        // Read-ahead thread IO is in progress. Wait for it to complete.
        std::unique_lock<std::mutex> lock(lock_);
        while (!(io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE ||
                 io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS)) {
            cv.wait(lock);
        }

        return GetRABuffer(&lock, block, buffer);
    }
}

// This is invoked by the read-ahead thread waiting for merge IOs
// to complete.
bool Snapuserd::WaitForMergeToComplete() {
    {
        std::unique_lock<std::mutex> lock(lock_);
        while (!(io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_BEGIN ||
                 io_state_ == READ_AHEAD_IO_TRANSITION::IO_TERMINATED)) {
            cv.wait(lock);
        }

        if (io_state_ == READ_AHEAD_IO_TRANSITION::IO_TERMINATED) {
            return false;
        }

        io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_IN_PROGRESS;
        return true;
    }
}

// This is invoked during the launch of worker threads. We wait
// for the read-ahead thread to be fully up before worker threads
// are launched; else we will have a race between the worker threads
// and the read-ahead thread, specifically during re-construction.
bool Snapuserd::WaitForReadAheadToStart() {
    {
        std::unique_lock<std::mutex> lock(lock_);
        while (!(io_state_ == READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS ||
                 io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE)) {
            cv.wait(lock);
        }

        if (io_state_ == READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE) {
            return false;
        }

        return true;
    }
}

// Invoked by worker threads when a sequence of merge operations
// is complete, notifying the read-ahead thread to make forward
// progress.
void Snapuserd::StartReadAhead() {
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_BEGIN;
    }

    cv.notify_one();
}

void Snapuserd::MergeCompleted() {
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::IO_TERMINATED;
    }

    cv.notify_one();
}

bool Snapuserd::ReadAheadIOCompleted(bool sync) {
    if (sync) {
        // Flush the entire buffer region
        int ret = msync(mapped_addr_, total_mapped_addr_length_, MS_SYNC);
        if (ret < 0) {
            PLOG(ERROR) << "msync failed after ReadAheadIOCompleted: " << ret;
            return false;
        }

        // Metadata and data are synced. Now, update the state.
        // We need to update the state after flushing data; if there is a crash
        // when read-ahead IO is in progress, the state of data in the COW file
        // is unknown. kCowReadAheadDone acts as a checkpoint wherein the data
        // in the scratch space is good and, during the next reboot, the
        // read-ahead thread can safely re-construct the data.
        struct BufferState* ra_state = GetBufferState();
        ra_state->read_ahead_state = kCowReadAheadDone;

        ret = msync(mapped_addr_, BLOCK_SZ, MS_SYNC);
        if (ret < 0) {
            PLOG(ERROR) << "msync failed to flush Readahead completion state...";
            return false;
        }
    }

    // Notify the worker threads
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::IO_IN_PROGRESS;
    }

    cv.notify_all();
    return true;
}

void Snapuserd::ReadAheadIOFailed() {
    {
        std::lock_guard<std::mutex> lock(lock_);
        io_state_ = READ_AHEAD_IO_TRANSITION::READ_AHEAD_FAILURE;
    }

    cv.notify_all();
}

//========== End of state transition functions ====================

bool Snapuserd::IsChunkIdMetadata(chunk_t chunk) {
    uint32_t stride = exceptions_per_area_ + 1;
    lldiv_t divresult = lldiv(chunk, stride);

    return (divresult.rem == NUM_SNAPSHOT_HDR_CHUNKS);
}
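// Example (assuming exceptions_per_area_ == 256 and NUM_SNAPSHOT_HDR_CHUNKS == 1):
// the stride is 257, so chunk-ids 1, 258, 515, ... map to metadata pages and
// must never be handed out as data chunk-ids.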

// Find the next free chunk-id to be assigned. Check if the next free
// chunk-id represents a metadata page. If so, skip it.
chunk_t Snapuserd::GetNextAllocatableChunkId(chunk_t chunk) {
    chunk_t next_chunk = chunk + 1;

    if (IsChunkIdMetadata(next_chunk)) {
        next_chunk += 1;
    }
    return next_chunk;
}

void Snapuserd::CheckMergeCompletionStatus() {
    if (!merge_initiated_) {
        SNAP_LOG(INFO) << "Merge was not initiated. Total-data-ops: "
                       << reader_->get_num_total_data_ops();
        return;
    }

    struct CowHeader* ch = reinterpret_cast<struct CowHeader*>(mapped_addr_);

    SNAP_LOG(INFO) << "Merge-status: Total-Merged-ops: " << ch->num_merge_ops
                   << " Total-data-ops: " << reader_->get_num_total_data_ops();
}

/*
 * Read the metadata from the COW device and
 * construct the metadata as required by the kernel.
 *
 * Please see the design of the kernel COW format.
 *
 * 1: Read the metadata from the internal COW device.
 * 2: There are 3 COW operations:
 *      a: Replace op
 *      b: Copy op
 *      c: Zero op
 * 3: For each of the 3 operations, op->new_block
 *    represents the block number in the base device
 *    for which one of the 3 operations has to be applied.
 *    This represents the old_chunk in the kernel COW format.
 * 4: We need to assign a new_chunk for a corresponding old_chunk.
 * 5: The algorithm is similar to how the kernel assigns chunk numbers
 *    while creating exceptions. However, there are a few cases
 *    which need to be addressed here:
 *      a: During the merge process, the kernel scans the metadata pages
 *         backwards when merge is initiated. Since we need
 *         to make sure that the merge ordering follows our COW format,
 *         we read the COW operations backwards and populate the
 *         metadata so that when the kernel starts merging backwards,
 *         those ops correspond to the beginning of our COW format.
 *      b: The kernel can merge successive operations if the two chunk IDs
 *         are contiguous. This can be problematic when there is a crash
 *         during merge; specifically when a merge operation has a dependency.
 *         These dependencies can only happen during copy operations.
 *
 *         To avoid this problem, we make sure overlapping copy operations
 *         are not batch merged.
 * 6: Use a monotonically increasing chunk number to assign the
 *    new_chunk.
 * 7: Each chunk-id represents either
 *      a: a metadata page or
 *      b: a data page.
 * 8: A chunk-id representing a data page is stored in a map.
 * 9: A chunk-id representing a metadata page is converted into a vector
 *    index. We store this in a vector as the kernel requests metadata during
 *    two stages:
 *      a: When the initial dm-snapshot device is created, the kernel requests
 *         all the metadata and stores it in its internal data-structures.
 *      b: During merge, the kernel requests the same metadata once again.
 *    In both these cases, a quick lookup based on chunk-id is done.
 * 10: When the chunk number is incremented, we need to check whether the
 *     chunk represents a metadata page and, if so, skip it.
 * 11: Each 4k page will contain 256 disk exceptions. We call this
 *     exceptions_per_area_.
 * 12: The kernel will stop issuing metadata IO requests when the new-chunk ID is 0.
 */
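//
// Illustrative chunk-id layout produced below (actual values depend on
// exceptions_per_area_):
//
//   chunk 0        -> dm-snapshot header
//   chunk 1        -> metadata page for area 0 (256 disk_exception entries)
//   chunks 2..257  -> data chunks described by area 0
//   chunk 258      -> metadata page for area 1, and so on.
//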
bool Snapuserd::ReadMetadata() {
    reader_ = std::make_unique<CowReader>();
    CowHeader header;
    CowOptions options;
    bool metadata_found = false;
    int replace_ops = 0, zero_ops = 0, copy_ops = 0;

    SNAP_LOG(DEBUG) << "ReadMetadata: Parsing cow file";

    if (!reader_->Parse(cow_fd_)) {
        SNAP_LOG(ERROR) << "Failed to parse";
        return false;
    }

    if (!reader_->GetHeader(&header)) {
        SNAP_LOG(ERROR) << "Failed to get header";
        return false;
    }

    if (!(header.block_size == BLOCK_SZ)) {
        SNAP_LOG(ERROR) << "Invalid header block size found: " << header.block_size;
        return false;
    }

    SNAP_LOG(DEBUG) << "Merge-ops: " << header.num_merge_ops;

    if (!MmapMetadata()) {
        SNAP_LOG(ERROR) << "mmap failed";
        return false;
    }

    // Initialize the iterator for reading metadata
    std::unique_ptr<ICowOpIter> cowop_rm_iter = reader_->GetRevMergeOpIter();

    exceptions_per_area_ = (CHUNK_SIZE << SECTOR_SHIFT) / sizeof(struct disk_exception);

    // Start from chunk number 2. Chunk 0 represents the header and chunk 1
    // represents the first metadata page.
    chunk_t data_chunk_id = NUM_SNAPSHOT_HDR_CHUNKS + 1;
    size_t num_ops = 0;

    loff_t offset = 0;
    std::unique_ptr<uint8_t[]> de_ptr =
            std::make_unique<uint8_t[]>(exceptions_per_area_ * sizeof(struct disk_exception));

    // This memset is important. The kernel will stop issuing IO when the new-chunk ID
    // is 0. When an area is not completely filled with all 256 exceptions,
    // this memset ensures that the metadata read terminates.
    memset(de_ptr.get(), 0, (exceptions_per_area_ * sizeof(struct disk_exception)));

    while (!cowop_rm_iter->Done()) {
        const CowOperation* cow_op = &cowop_rm_iter->Get();
        struct disk_exception* de =
                reinterpret_cast<struct disk_exception*>((char*)de_ptr.get() + offset);

        metadata_found = true;
        // This loop will handle all the replace and zero ops.
        // We will handle the copy ops later as they require special
        // handling when assigning chunk-ids. Furthermore, we make
        // sure that replace/zero and copy ops are not batch merged; hence
        // the bump in the chunk_id before breaking out of this loop.
        if (IsOrderedOp(*cow_op)) {
            data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
            break;
        }

        if (cow_op->type == kCowReplaceOp) {
            replace_ops++;
        } else if (cow_op->type == kCowZeroOp) {
            zero_ops++;
        }

        // Construct the disk-exception
        de->old_chunk = cow_op->new_block;
        de->new_chunk = data_chunk_id;

        // Store operation pointer.
        chunk_vec_.push_back(std::make_pair(ChunkToSector(data_chunk_id), cow_op));
        num_ops += 1;
        offset += sizeof(struct disk_exception);
        cowop_rm_iter->Next();

        SNAP_LOG(DEBUG) << num_ops << ":"
                        << " Old-chunk: " << de->old_chunk << " New-chunk: " << de->new_chunk;

        if (num_ops == exceptions_per_area_) {
            // Store it in vector at the right index. This maps the chunk-id to
            // vector index.
            vec_.push_back(std::move(de_ptr));
            offset = 0;
            num_ops = 0;

            // Create buffer for next area
            de_ptr = std::make_unique<uint8_t[]>(exceptions_per_area_ *
                                                 sizeof(struct disk_exception));
            memset(de_ptr.get(), 0, (exceptions_per_area_ * sizeof(struct disk_exception)));

            if (cowop_rm_iter->Done()) {
                vec_.push_back(std::move(de_ptr));
            }
        }

        data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
    }
    int num_ra_ops_per_iter = ((GetBufferDataSize()) / BLOCK_SZ);
    std::optional<chunk_t> prev_id = {};
    std::vector<const CowOperation*> vec;
    std::set<uint64_t> dest_blocks;
    std::set<uint64_t> source_blocks;
    size_t pending_ordered_ops = exceptions_per_area_ - num_ops;
    uint64_t total_ordered_ops = reader_->get_num_ordered_ops_to_merge();

    SNAP_LOG(DEBUG) << " Processing copy-ops at Area: " << vec_.size()
                    << " Number of replace/zero ops completed in this area: " << num_ops
                    << " Pending copy ops for this area: " << pending_ordered_ops;

    while (!cowop_rm_iter->Done()) {
        do {
            const CowOperation* cow_op = &cowop_rm_iter->Get();

            // We have two specific cases:
            //
            // =====================================================
            // Case 1: Overlapping copy regions
            //
            // Ex:
            //
            // Source -> Destination
            //
            // 1: 15 -> 18
            // 2: 16 -> 19
            // 3: 17 -> 20
            // 4: 18 -> 21
            // 5: 19 -> 22
            // 6: 20 -> 23
            //
            // We have 6 copy operations to be executed in the OTA and there is
            // an overlap. Update-engine will write to the COW file as follows:
            //
            // Op-1: 20 -> 23
            // Op-2: 19 -> 22
            // Op-3: 18 -> 21
            // Op-4: 17 -> 20
            // Op-5: 16 -> 19
            // Op-6: 15 -> 18
            //
            // Note that the block numbers are contiguous. Hence, all 6 copy
            // operations can be batch merged. However, that would be
            // problematic if we have a crash, as blocks 20, 19 and 18 would
            // have been overwritten and hence a subsequent recovery may end
            // up with silent data corruption when op-1, op-2 and op-3 are
            // re-executed.
            //
            // To address the above problem, the read-ahead thread will
            // read all the 6 source blocks and cache them in the scratch
            // space of the COW file. During merge, the read-ahead
            // thread will serve the blocks from the read-ahead cache.
            // If there is a crash during merge, then on the subsequent reboot
            // the read-ahead thread will recover the data from the
            // scratch space and re-construct it, thereby ensuring there
            // is no loss of data.
            //
            // Note that we will follow the same order of COW operations
            // as present in the COW file. This makes sure that the
            // operations are merged based on the order of ops present
            // in the file.
            //===========================================================
            uint64_t block_source = cow_op->source;
            uint64_t block_offset = 0;
            if (prev_id.has_value()) {
                if (dest_blocks.count(cow_op->new_block) || source_blocks.count(block_source) ||
                    (block_offset > 0 && source_blocks.count(block_source + 1))) {
                    break;
                }
            }
            metadata_found = true;
            pending_ordered_ops -= 1;
            vec.push_back(cow_op);
            dest_blocks.insert(block_source);
            if (block_offset > 0) {
                dest_blocks.insert(block_source + 1);
            }
            source_blocks.insert(cow_op->new_block);
            prev_id = cow_op->new_block;
            cowop_rm_iter->Next();
        } while (!cowop_rm_iter->Done() && pending_ordered_ops);

        data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
        SNAP_LOG(DEBUG) << "Batch Merge copy-ops of size: " << vec.size()
                        << " Area: " << vec_.size() << " Area offset: " << offset
                        << " Pending-ordered-ops in this area: " << pending_ordered_ops;

        for (size_t i = 0; i < vec.size(); i++) {
            struct disk_exception* de =
                    reinterpret_cast<struct disk_exception*>((char*)de_ptr.get() + offset);
            const CowOperation* cow_op = vec[i];

            de->old_chunk = cow_op->new_block;
            de->new_chunk = data_chunk_id;

            // Store operation pointer.
            chunk_vec_.push_back(std::make_pair(ChunkToSector(data_chunk_id), cow_op));
            offset += sizeof(struct disk_exception);
            num_ops += 1;
            if (cow_op->type == kCowCopyOp) {
                copy_ops++;
            }

            if (read_ahead_feature_) {
                read_ahead_ops_.push_back(cow_op);
            }

            SNAP_LOG(DEBUG) << num_ops << ":"
                            << " Ordered-op: "
                            << " Old-chunk: " << de->old_chunk << " New-chunk: " << de->new_chunk;

            if (num_ops == exceptions_per_area_) {
                // Store it in vector at the right index. This maps the chunk-id to
                // vector index.
                vec_.push_back(std::move(de_ptr));
                num_ops = 0;
                offset = 0;

                // Create buffer for next area
                de_ptr = std::make_unique<uint8_t[]>(exceptions_per_area_ *
                                                     sizeof(struct disk_exception));
                memset(de_ptr.get(), 0, (exceptions_per_area_ * sizeof(struct disk_exception)));

                if (cowop_rm_iter->Done()) {
                    vec_.push_back(std::move(de_ptr));
                    SNAP_LOG(DEBUG) << "ReadMetadata() completed; Number of Areas: " << vec_.size();
                }

                if (!(pending_ordered_ops == 0)) {
                    SNAP_LOG(ERROR) << "Invalid pending_ordered_ops: expected: 0 found: "
                                    << pending_ordered_ops;
                    return false;
                }
                pending_ordered_ops = exceptions_per_area_;
            }

            data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
            total_ordered_ops -= 1;
            /*
             * Split the number of ops based on the size of read-ahead buffer
             * region. We need to ensure that kernel doesn't issue IO on blocks
             * which are not read by the read-ahead thread.
             */
            if (read_ahead_feature_ && (total_ordered_ops % num_ra_ops_per_iter == 0)) {
                data_chunk_id = GetNextAllocatableChunkId(data_chunk_id);
            }
        }
        vec.clear();
        dest_blocks.clear();
        source_blocks.clear();
        prev_id.reset();
    }

    // Partially filled area or there is no metadata.
    // If there is no metadata, fill with zero so that the kernel
    // is aware that the merge is completed.
    if (num_ops || !metadata_found) {
        vec_.push_back(std::move(de_ptr));
        SNAP_LOG(DEBUG) << "ReadMetadata() completed. Partially filled area num_ops: " << num_ops
                        << " Areas: " << vec_.size();
    }

    chunk_vec_.shrink_to_fit();
    vec_.shrink_to_fit();
    read_ahead_ops_.shrink_to_fit();

    // Sort the vector based on sectors as we need this during un-aligned access
    std::sort(chunk_vec_.begin(), chunk_vec_.end(), compare);

    SNAP_LOG(INFO) << "ReadMetadata completed. Final-chunk-id: " << data_chunk_id
                   << " Num Sector: " << ChunkToSector(data_chunk_id)
                   << " Replace-ops: " << replace_ops << " Zero-ops: " << zero_ops
                   << " Copy-ops: " << copy_ops << " Areas: " << vec_.size()
                   << " Num-ops-merged: " << header.num_merge_ops
                   << " Total-data-ops: " << reader_->get_num_total_data_ops();

    // Total number of sectors required for creating dm-user device
    num_sectors_ = ChunkToSector(data_chunk_id);
    merge_initiated_ = false;
    PrepareReadAhead();

    return true;
}

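// Map the COW header (and, for v2 COWs with a scratch buffer, the 2 MB
// read-ahead buffer region that follows it) so that merge progress and
// read-ahead state can be updated in place and flushed with msync().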
bool Snapuserd::MmapMetadata() {
    CowHeader header;
    reader_->GetHeader(&header);

    if (header.major_version >= 2 && header.buffer_size > 0) {
        total_mapped_addr_length_ = header.header_size + BUFFER_REGION_DEFAULT_SIZE;
        read_ahead_feature_ = true;
    } else {
        // mmap the first 4k page - older COW format
        total_mapped_addr_length_ = BLOCK_SZ;
        read_ahead_feature_ = false;
    }

    mapped_addr_ = mmap(NULL, total_mapped_addr_length_, PROT_READ | PROT_WRITE, MAP_SHARED,
                        cow_fd_.get(), 0);
    if (mapped_addr_ == MAP_FAILED) {
        SNAP_LOG(ERROR) << "mmap metadata failed";
        return false;
    }

    return true;
}

void Snapuserd::UnmapBufferRegion() {
    int ret = munmap(mapped_addr_, total_mapped_addr_length_);
    if (ret < 0) {
        SNAP_PLOG(ERROR) << "munmap failed";
    }
}

void MyLogger(android::base::LogId, android::base::LogSeverity severity, const char*, const char*,
              unsigned int, const char* message) {
    if (severity == android::base::ERROR) {
        fprintf(stderr, "%s\n", message);
    } else {
        fprintf(stdout, "%s\n", message);
    }
}

bool Snapuserd::InitCowDevice() {
    cow_fd_.reset(open(cow_device_.c_str(), O_RDWR));
    if (cow_fd_ < 0) {
        SNAP_PLOG(ERROR) << "Open Failed: " << cow_device_;
        return false;
    }

    return ReadMetadata();
}

void Snapuserd::ReadBlocksToCache(const std::string& dm_block_device,
                                  const std::string& partition_name, off_t offset, size_t size) {
    android::base::unique_fd fd(TEMP_FAILURE_RETRY(open(dm_block_device.c_str(), O_RDONLY)));
    if (fd.get() == -1) {
        SNAP_PLOG(ERROR) << "Error reading " << dm_block_device
                         << " partition-name: " << partition_name;
        return;
    }

    size_t remain = size;
    off_t file_offset = offset;
    // We pick 4M I/O size based on the fact that the current
    // update_verifier has a similar I/O size.
    size_t read_sz = 1024 * BLOCK_SZ;
    std::vector<uint8_t> buf(read_sz);

    while (remain > 0) {
        size_t to_read = std::min(remain, read_sz);

        if (!android::base::ReadFullyAtOffset(fd.get(), buf.data(), to_read, file_offset)) {
            SNAP_PLOG(ERROR) << "Failed to read block from block device: " << dm_block_device
                             << " at offset: " << file_offset
                             << " partition-name: " << partition_name << " total-size: " << size
                             << " remain_size: " << remain;
            return;
        }

        file_offset += to_read;
        remain -= to_read;
    }

    SNAP_LOG(INFO) << "Finished reading block-device: " << dm_block_device
                   << " partition: " << partition_name << " size: " << size
                   << " offset: " << offset;
}

void Snapuserd::ReadBlocks(const std::string& partition_name, const std::string& dm_block_device) {
    SNAP_LOG(DEBUG) << "Reading partition: " << partition_name
                    << " Block-Device: " << dm_block_device;

    uint64_t dev_sz = 0;

    unique_fd fd(TEMP_FAILURE_RETRY(open(dm_block_device.c_str(), O_RDONLY | O_CLOEXEC)));
    if (fd < 0) {
        SNAP_LOG(ERROR) << "Cannot open block device";
        return;
    }

    dev_sz = get_block_device_size(fd.get());
    if (!dev_sz) {
        SNAP_PLOG(ERROR) << "Could not determine block device size: " << dm_block_device;
        return;
    }

    int num_threads = 2;
    size_t num_blocks = dev_sz >> BLOCK_SHIFT;
    size_t num_blocks_per_thread = num_blocks / num_threads;
    size_t read_sz_per_thread = num_blocks_per_thread << BLOCK_SHIFT;
    off_t offset = 0;

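    // Note: the std::future returned by std::async below is discarded, and its
    // destructor blocks until the associated task finishes; each ReadBlocksToCache
    // call therefore completes before the next one is launched.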
    for (int i = 0; i < num_threads; i++) {
        std::async(std::launch::async, &Snapuserd::ReadBlocksToCache, this, dm_block_device,
                   partition_name, offset, read_sz_per_thread);

        offset += read_sz_per_thread;
    }
}

/*
 * Entry point to launch threads
 */
bool Snapuserd::Start() {
    std::vector<std::future<bool>> threads;
    std::future<bool> ra_thread;
    bool rathread = (read_ahead_feature_ && (read_ahead_ops_.size() > 0));

    // Start the read-ahead thread and wait for it to come up,
    // as the data has to be re-constructed from the COW device.
    if (rathread) {
        ra_thread = std::async(std::launch::async, &ReadAheadThread::RunThread,
                               read_ahead_thread_.get());
        if (!WaitForReadAheadToStart()) {
            SNAP_LOG(ERROR) << "Failed to start Read-ahead thread...";
            return false;
        }

        SNAP_LOG(INFO) << "Read-ahead thread started...";
    }

    // Launch worker threads
    for (int i = 0; i < worker_threads_.size(); i++) {
        threads.emplace_back(
                std::async(std::launch::async, &WorkerThread::RunThread, worker_threads_[i].get()));
    }

    bool second_stage_init = true;

    // We don't want to read the blocks during first stage init.
    if (android::base::EndsWith(misc_name_, "-init") || is_socket_present_) {
        second_stage_init = false;
    }

    if (second_stage_init) {
        SNAP_LOG(INFO) << "Reading blocks to cache....";
        auto& dm = DeviceMapper::Instance();
        auto dm_block_devices = dm.FindDmPartitions();
        if (dm_block_devices.empty()) {
            SNAP_LOG(ERROR) << "No dm-enabled block device is found.";
        } else {
            auto parts = android::base::Split(misc_name_, "-");
            std::string partition_name = parts[0];

            const char* suffix_b = "_b";
            const char* suffix_a = "_a";

            partition_name.erase(partition_name.find_last_not_of(suffix_b) + 1);
            partition_name.erase(partition_name.find_last_not_of(suffix_a) + 1);

            if (dm_block_devices.find(partition_name) == dm_block_devices.end()) {
                SNAP_LOG(ERROR) << "Failed to find dm block device for " << partition_name;
            } else {
                ReadBlocks(partition_name, dm_block_devices.at(partition_name));
            }
        }
    } else {
        SNAP_LOG(INFO) << "Not reading block device into cache";
    }

    bool ret = true;
    for (auto& t : threads) {
        ret = t.get() && ret;
    }

    if (rathread) {
        // Notify the read-ahead thread that all worker threads
        // are done. We need this explicit notification when
        // there is an IO failure or there was a switch
        // of the dm-user table; this forces the read-ahead
        // thread to wake up.
        MergeCompleted();
        ret = ret && ra_thread.get();
    }

    return ret;
}

uint64_t Snapuserd::GetBufferMetadataOffset() {
    CowHeader header;
    reader_->GetHeader(&header);

    size_t size = header.header_size + sizeof(BufferState);
    return size;
}

/*
 * Metadata for read-ahead is 16 bytes per block. For a 2 MB region, we will
 * end up with 8k (2 pages) worth of metadata. Thus, a 2 MB buffer region is
 * split into:
 *
 * 1: 8k of metadata
 * 2: The remaining space, which holds the read-ahead data (see
 *    GetBufferDataSize() below).
 */
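// A sketch of the arithmetic, assuming a 2 MB scratch buffer (buffer_size),
// 4 KiB blocks and a 16-byte ScratchMetadata entry:
//   (2097152 * 16) / 4096 = 8192 bytes of metadata (2 pages).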
size_t Snapuserd::GetBufferMetadataSize() {
    CowHeader header;
    reader_->GetHeader(&header);

    size_t metadata_bytes = (header.buffer_size * sizeof(struct ScratchMetadata)) / BLOCK_SZ;
    return metadata_bytes;
}

size_t Snapuserd::GetBufferDataOffset() {
    CowHeader header;
    reader_->GetHeader(&header);

    return (header.header_size + GetBufferMetadataSize());
}

/*
 * (2MB - 8K = 2088960 bytes) will be the buffer region to hold the data.
 */
size_t Snapuserd::GetBufferDataSize() {
    CowHeader header;
    reader_->GetHeader(&header);

    size_t size = header.buffer_size - GetBufferMetadataSize();
    return size;
}

struct BufferState* Snapuserd::GetBufferState() {
    CowHeader header;
    reader_->GetHeader(&header);

    struct BufferState* ra_state =
            reinterpret_cast<struct BufferState*>((char*)mapped_addr_ + header.header_size);
    return ra_state;
}

}  // namespace snapshot
}  // namespace android