1 /* 2 * Copyright (C) 2008 The Android Open Source Project 3 * 4 * Licensed under the Apache License, Version 2.0 (the "License"); 5 * you may not use this file except in compliance with the License. 6 * You may obtain a copy of the License at 7 * 8 * http://www.apache.org/licenses/LICENSE-2.0 9 * 10 * Unless required by applicable law or agreed to in writing, software 11 * distributed under the License is distributed on an "AS IS" BASIS, 12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 13 * See the License for the specific language governing permissions and 14 * limitations under the License. 15 */ 16 17 package com.android.server; 18 19 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withCustomTimeout; 20 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withDefaultTimeout; 21 22 import android.annotation.NonNull; 23 import android.annotation.Nullable; 24 import android.app.IActivityController; 25 import android.content.BroadcastReceiver; 26 import android.content.Context; 27 import android.content.Intent; 28 import android.content.IntentFilter; 29 import android.database.ContentObserver; 30 import android.hidl.manager.V1_0.IServiceManager; 31 import android.net.Uri; 32 import android.os.Binder; 33 import android.os.Build; 34 import android.os.Debug; 35 import android.os.FileUtils; 36 import android.os.Handler; 37 import android.os.IPowerManager; 38 import android.os.Looper; 39 import android.os.Process; 40 import android.os.RemoteException; 41 import android.os.ServiceDebugInfo; 42 import android.os.ServiceManager; 43 import android.os.SystemClock; 44 import android.os.SystemProperties; 45 import android.os.UserHandle; 46 import android.provider.Settings; 47 import android.sysprop.WatchdogProperties; 48 import android.util.Dumpable; 49 import android.util.EventLog; 50 import android.util.Log; 51 import android.util.Slog; 52 import android.util.SparseBooleanArray; 53 54 import com.android.internal.os.BackgroundThread; 55 import com.android.internal.os.ProcessCpuTracker; 56 import com.android.internal.os.ZygoteConnectionConstants; 57 import com.android.internal.util.FrameworkStatsLog; 58 import com.android.server.am.ActivityManagerService; 59 import com.android.server.am.StackTracesDumpHelper; 60 import com.android.server.am.TraceErrorLogger; 61 import com.android.server.criticalevents.CriticalEventLog; 62 import com.android.server.wm.SurfaceAnimationThread; 63 64 import java.io.BufferedReader; 65 import java.io.File; 66 import java.io.FileNotFoundException; 67 import java.io.FileReader; 68 import java.io.FileWriter; 69 import java.io.IOException; 70 import java.io.PrintWriter; 71 import java.io.StringWriter; 72 import java.util.ArrayList; 73 import java.util.Arrays; 74 import java.util.Collections; 75 import java.util.HashSet; 76 import java.util.List; 77 import java.util.Optional; 78 import java.util.UUID; 79 import java.util.concurrent.CompletableFuture; 80 import java.util.concurrent.TimeUnit; 81 82 /** 83 * This class calls its monitor every minute. Killing this process if they don't return 84 **/ 85 public class Watchdog implements Dumpable { 86 static final String TAG = "Watchdog"; 87 88 /** Debug flag. */ 89 public static final boolean DEBUG = false; 90 91 // Set this to true to use debug default values. 92 private static final boolean DB = false; 93 94 // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with 95 // timeout in com.android.internal.os.ZygoteConnection, or wrapped applications 96 // can trigger the watchdog. 97 // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped 98 // applications may not work with a debug build. CTS will fail. 99 private static final long DEFAULT_TIMEOUT = DB ? 10 * 1000 : 60 * 1000; 100 101 // These are temporally ordered: larger values as lateness increases 102 private static final int COMPLETED = 0; 103 private static final int WAITING = 1; 104 private static final int WAITED_HALF = 2; 105 private static final int OVERDUE = 3; 106 107 // Track watchdog timeout history and break the crash loop if there is. 108 private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt"; 109 private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count"; 110 private static final String PROP_FATAL_LOOP_WINDOWS_SECS = 111 "framework_watchdog.fatal_window.second"; 112 113 // Which native processes to dump into dropbox's stack traces 114 public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] { 115 "/system/bin/audioserver", 116 "/system/bin/cameraserver", 117 "/system/bin/drmserver", 118 "/system/bin/keystore2", 119 "/system/bin/mediadrmserver", 120 "/system/bin/mediaserver", 121 "/system/bin/netd", 122 "/system/bin/sdcard", 123 "/system/bin/surfaceflinger", 124 "/system/bin/vold", 125 "media.extractor", // system/bin/mediaextractor 126 "media.metrics", // system/bin/mediametrics 127 "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service 128 "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec 129 "media.transcoding", // Media transcoding service 130 "com.android.bluetooth", // Bluetooth service 131 "/apex/com.android.os.statsd/bin/statsd", // Stats daemon 132 }; 133 134 public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList( 135 "android.hardware.audio@4.0::IDevicesFactory", 136 "android.hardware.audio@5.0::IDevicesFactory", 137 "android.hardware.audio@6.0::IDevicesFactory", 138 "android.hardware.audio@7.0::IDevicesFactory", 139 "android.hardware.biometrics.face@1.0::IBiometricsFace", 140 "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint", 141 "android.hardware.bluetooth@1.0::IBluetoothHci", 142 "android.hardware.camera.provider@2.4::ICameraProvider", 143 "android.hardware.gnss@1.0::IGnss", 144 "android.hardware.graphics.allocator@2.0::IAllocator", 145 "android.hardware.graphics.composer@2.1::IComposer", 146 "android.hardware.health@2.0::IHealth", 147 "android.hardware.light@2.0::ILight", 148 "android.hardware.media.c2@1.0::IComponentStore", 149 "android.hardware.media.omx@1.0::IOmx", 150 "android.hardware.media.omx@1.0::IOmxStore", 151 "android.hardware.neuralnetworks@1.0::IDevice", 152 "android.hardware.power@1.0::IPower", 153 "android.hardware.power.stats@1.0::IPowerStats", 154 "android.hardware.sensors@1.0::ISensors", 155 "android.hardware.sensors@2.0::ISensors", 156 "android.hardware.sensors@2.1::ISensors", 157 "android.hardware.vibrator@1.0::IVibrator", 158 "android.hardware.vr@1.0::IVr", 159 "android.system.suspend@1.0::ISystemSuspend" 160 ); 161 162 public static final String[] AIDL_INTERFACE_PREFIXES_OF_INTEREST = new String[] { 163 "android.hardware.audio.core.IModule/", 164 "android.hardware.audio.core.IConfig/", 165 "android.hardware.biometrics.face.IFace/", 166 "android.hardware.biometrics.fingerprint.IFingerprint/", 167 "android.hardware.bluetooth.IBluetoothHci/", 168 "android.hardware.camera.provider.ICameraProvider/", 169 "android.hardware.gnss.IGnss/", 170 "android.hardware.graphics.allocator.IAllocator/", 171 "android.hardware.graphics.composer3.IComposer/", 172 "android.hardware.health.IHealth/", 173 "android.hardware.input.processor.IInputProcessor/", 174 "android.hardware.light.ILights/", 175 "android.hardware.neuralnetworks.IDevice/", 176 "android.hardware.power.IPower/", 177 "android.hardware.power.stats.IPowerStats/", 178 "android.hardware.sensors.ISensors/", 179 "android.hardware.vibrator.IVibrator/", 180 "android.hardware.vibrator.IVibratorManager/", 181 "android.system.suspend.ISystemSuspend/", 182 }; 183 184 private static Watchdog sWatchdog; 185 186 private final Thread mThread; 187 188 private final Object mLock = new Object(); 189 190 /* This handler will be used to post message back onto the main thread */ 191 private final ArrayList<HandlerCheckerAndTimeout> mHandlerCheckers = new ArrayList<>(); 192 private final HandlerChecker mMonitorChecker; 193 private ActivityManagerService mActivity; 194 private IActivityController mController; 195 private boolean mAllowRestart = true; 196 // We start with DEFAULT_TIMEOUT. This will then be update with the timeout values from Settings 197 // once the settings provider is initialized. 198 private volatile long mWatchdogTimeoutMillis = DEFAULT_TIMEOUT; 199 private final List<Integer> mInterestingJavaPids = new ArrayList<>(); 200 private final TraceErrorLogger mTraceErrorLogger; 201 202 /** Holds a checker and its timeout. */ 203 static final class HandlerCheckerAndTimeout { 204 private final HandlerChecker mHandler; 205 private final Optional<Long> mCustomTimeoutMillis; 206 HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis)207 private HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis) { 208 this.mHandler = checker; 209 this.mCustomTimeoutMillis = timeoutMillis; 210 } 211 checker()212 HandlerChecker checker() { 213 return mHandler; 214 } 215 216 /** Returns the timeout. */ customTimeoutMillis()217 Optional<Long> customTimeoutMillis() { 218 return mCustomTimeoutMillis; 219 } 220 221 /** 222 * Creates a checker with the default timeout. The timeout will use the default value which 223 * is configurable server-side. 224 */ withDefaultTimeout(HandlerChecker checker)225 static HandlerCheckerAndTimeout withDefaultTimeout(HandlerChecker checker) { 226 return new HandlerCheckerAndTimeout(checker, Optional.empty()); 227 } 228 229 /** 230 * Creates a checker with a custom timeout. The timeout overrides the default value and will 231 * always be used. 232 */ withCustomTimeout( HandlerChecker checker, long timeoutMillis)233 static HandlerCheckerAndTimeout withCustomTimeout( 234 HandlerChecker checker, long timeoutMillis) { 235 return new HandlerCheckerAndTimeout(checker, Optional.of(timeoutMillis)); 236 } 237 } 238 239 /** 240 * Used for checking status of handle threads and scheduling monitor callbacks. 241 */ 242 public final class HandlerChecker implements Runnable { 243 private final Handler mHandler; 244 private final String mName; 245 private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>(); 246 private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>(); 247 private long mWaitMaxMillis; 248 private boolean mCompleted; 249 private Monitor mCurrentMonitor; 250 private long mStartTimeMillis; 251 private int mPauseCount; 252 HandlerChecker(Handler handler, String name)253 HandlerChecker(Handler handler, String name) { 254 mHandler = handler; 255 mName = name; 256 mCompleted = true; 257 } 258 addMonitorLocked(Monitor monitor)259 void addMonitorLocked(Monitor monitor) { 260 // We don't want to update mMonitors when the Handler is in the middle of checking 261 // all monitors. We will update mMonitors on the next schedule if it is safe 262 mMonitorQueue.add(monitor); 263 } 264 265 /** 266 * Schedules a run on the handler thread. 267 * 268 * @param handlerCheckerTimeoutMillis the timeout to use for this run 269 */ scheduleCheckLocked(long handlerCheckerTimeoutMillis)270 public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) { 271 mWaitMaxMillis = handlerCheckerTimeoutMillis; 272 if (mCompleted) { 273 // Safe to update monitors in queue, Handler is not in the middle of work 274 mMonitors.addAll(mMonitorQueue); 275 mMonitorQueue.clear(); 276 } 277 if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) 278 || (mPauseCount > 0)) { 279 // Don't schedule until after resume OR 280 // If the target looper has recently been polling, then 281 // there is no reason to enqueue our checker on it since that 282 // is as good as it not being deadlocked. This avoid having 283 // to do a context switch to check the thread. Note that we 284 // only do this if we have no monitors since those would need to 285 // be executed at this point. 286 mCompleted = true; 287 return; 288 } 289 if (!mCompleted) { 290 // we already have a check in flight, so no need 291 return; 292 } 293 294 mCompleted = false; 295 mCurrentMonitor = null; 296 mStartTimeMillis = SystemClock.uptimeMillis(); 297 mHandler.postAtFrontOfQueue(this); 298 } 299 getCompletionStateLocked()300 public int getCompletionStateLocked() { 301 if (mCompleted) { 302 return COMPLETED; 303 } else { 304 long latency = SystemClock.uptimeMillis() - mStartTimeMillis; 305 if (latency < mWaitMaxMillis / 2) { 306 return WAITING; 307 } else if (latency < mWaitMaxMillis) { 308 return WAITED_HALF; 309 } 310 } 311 return OVERDUE; 312 } 313 getThread()314 public Thread getThread() { 315 return mHandler.getLooper().getThread(); 316 } 317 getName()318 public String getName() { 319 return mName; 320 } 321 describeBlockedStateLocked()322 String describeBlockedStateLocked() { 323 final String prefix; 324 if (mCurrentMonitor == null) { 325 prefix = "Blocked in handler on "; 326 } else { 327 prefix = "Blocked in monitor " + mCurrentMonitor.getClass().getName(); 328 } 329 long latencySeconds = (SystemClock.uptimeMillis() - mStartTimeMillis) / 1000; 330 return prefix + " on " + mName + " (" + getThread().getName() + ")" 331 + " for " + latencySeconds + "s"; 332 } 333 334 @Override run()335 public void run() { 336 // Once we get here, we ensure that mMonitors does not change even if we call 337 // #addMonitorLocked because we first add the new monitors to mMonitorQueue and 338 // move them to mMonitors on the next schedule when mCompleted is true, at which 339 // point we have completed execution of this method. 340 final int size = mMonitors.size(); 341 for (int i = 0 ; i < size ; i++) { 342 synchronized (mLock) { 343 mCurrentMonitor = mMonitors.get(i); 344 } 345 mCurrentMonitor.monitor(); 346 } 347 348 synchronized (mLock) { 349 mCompleted = true; 350 mCurrentMonitor = null; 351 } 352 } 353 354 /** Pause the HandlerChecker. */ pauseLocked(String reason)355 public void pauseLocked(String reason) { 356 mPauseCount++; 357 // Mark as completed, because there's a chance we called this after the watchog 358 // thread loop called Object#wait after 'WAITED_HALF'. In that case we want to ensure 359 // the next call to #getCompletionStateLocked for this checker returns 'COMPLETED' 360 mCompleted = true; 361 Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: " 362 + reason + ". Pause count: " + mPauseCount); 363 } 364 365 /** Resume the HandlerChecker from the last {@link #pauseLocked}. */ resumeLocked(String reason)366 public void resumeLocked(String reason) { 367 if (mPauseCount > 0) { 368 mPauseCount--; 369 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: " 370 + reason + ". Pause count: " + mPauseCount); 371 } else { 372 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName); 373 } 374 } 375 } 376 377 final class RebootRequestReceiver extends BroadcastReceiver { 378 @Override onReceive(Context c, Intent intent)379 public void onReceive(Context c, Intent intent) { 380 if (intent.getIntExtra("nowait", 0) != 0) { 381 rebootSystem("Received ACTION_REBOOT broadcast"); 382 return; 383 } 384 Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent); 385 } 386 } 387 388 /** Monitor for checking the availability of binder threads. The monitor will block until 389 * there is a binder thread available to process in coming IPCs to make sure other processes 390 * can still communicate with the service. 391 */ 392 private static final class BinderThreadMonitor implements Watchdog.Monitor { 393 @Override monitor()394 public void monitor() { 395 Binder.blockUntilThreadAvailable(); 396 } 397 } 398 399 public interface Monitor { monitor()400 void monitor(); 401 } 402 getInstance()403 public static Watchdog getInstance() { 404 if (sWatchdog == null) { 405 sWatchdog = new Watchdog(); 406 } 407 408 return sWatchdog; 409 } 410 Watchdog()411 private Watchdog() { 412 mThread = new Thread(this::run, "watchdog"); 413 414 // Initialize handler checkers for each common thread we want to check. Note 415 // that we are not currently checking the background thread, since it can 416 // potentially hold longer running operations with no guarantees about the timeliness 417 // of operations there. 418 // 419 // Use a custom thread to check monitors to avoid lock contention from impacted other 420 // threads. 421 ServiceThread t = new ServiceThread("watchdog.monitor", 422 android.os.Process.THREAD_PRIORITY_DEFAULT, true /*allowIo*/); 423 t.start(); 424 mMonitorChecker = new HandlerChecker(new Handler(t.getLooper()), "monitor thread"); 425 mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker)); 426 427 mHandlerCheckers.add(withDefaultTimeout( 428 new HandlerChecker(FgThread.getHandler(), "foreground thread"))); 429 // Add checker for main thread. We only do a quick check since there 430 // can be UI running on the thread. 431 mHandlerCheckers.add(withDefaultTimeout( 432 new HandlerChecker(new Handler(Looper.getMainLooper()), "main thread"))); 433 // Add checker for shared UI thread. 434 mHandlerCheckers.add(withDefaultTimeout( 435 new HandlerChecker(UiThread.getHandler(), "ui thread"))); 436 // And also check IO thread. 437 mHandlerCheckers.add(withDefaultTimeout( 438 new HandlerChecker(IoThread.getHandler(), "i/o thread"))); 439 // And the display thread. 440 mHandlerCheckers.add(withDefaultTimeout( 441 new HandlerChecker(DisplayThread.getHandler(), "display thread"))); 442 // And the animation thread. 443 mHandlerCheckers.add(withDefaultTimeout( 444 new HandlerChecker(AnimationThread.getHandler(), "animation thread"))); 445 // And the surface animation thread. 446 mHandlerCheckers.add(withDefaultTimeout( 447 new HandlerChecker(SurfaceAnimationThread.getHandler(), 448 "surface animation thread"))); 449 // Initialize monitor for Binder threads. 450 addMonitor(new BinderThreadMonitor()); 451 452 mInterestingJavaPids.add(Process.myPid()); 453 454 // See the notes on DEFAULT_TIMEOUT. 455 assert DB || 456 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS; 457 458 mTraceErrorLogger = new TraceErrorLogger(); 459 } 460 461 /** 462 * Called by SystemServer to cause the internal thread to begin execution. 463 */ start()464 public void start() { 465 mThread.start(); 466 } 467 468 /** 469 * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot. 470 * Should be called during boot after the ActivityManagerService is up and registered 471 * as a system service so it can handle registration of a {@link BroadcastReceiver}. 472 */ init(Context context, ActivityManagerService activity)473 public void init(Context context, ActivityManagerService activity) { 474 mActivity = activity; 475 context.registerReceiver(new RebootRequestReceiver(), 476 new IntentFilter(Intent.ACTION_REBOOT), 477 android.Manifest.permission.REBOOT, null); 478 } 479 480 private static class SettingsObserver extends ContentObserver { 481 private final Uri mUri = Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS); 482 private final Context mContext; 483 private final Watchdog mWatchdog; 484 SettingsObserver(Context context, Watchdog watchdog)485 SettingsObserver(Context context, Watchdog watchdog) { 486 super(BackgroundThread.getHandler()); 487 mContext = context; 488 mWatchdog = watchdog; 489 // Always kick once to ensure that we match current state 490 onChange(); 491 } 492 493 @Override onChange(boolean selfChange, Uri uri, int userId)494 public void onChange(boolean selfChange, Uri uri, int userId) { 495 if (mUri.equals(uri)) { 496 onChange(); 497 } 498 } 499 onChange()500 public void onChange() { 501 try { 502 mWatchdog.updateWatchdogTimeout(Settings.Global.getLong( 503 mContext.getContentResolver(), 504 Settings.Global.WATCHDOG_TIMEOUT_MILLIS, DEFAULT_TIMEOUT)); 505 } catch (RuntimeException e) { 506 Slog.e(TAG, "Exception while reading settings " + e.getMessage(), e); 507 } 508 } 509 } 510 511 /** 512 * Register an observer to listen to settings. 513 * 514 * It needs to be called after the settings service is initialized. 515 */ registerSettingsObserver(Context context)516 public void registerSettingsObserver(Context context) { 517 context.getContentResolver().registerContentObserver( 518 Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS), 519 false, 520 new SettingsObserver(context, this), 521 UserHandle.USER_SYSTEM); 522 } 523 524 /** 525 * Updates watchdog timeout values. 526 */ updateWatchdogTimeout(long timeoutMillis)527 void updateWatchdogTimeout(long timeoutMillis) { 528 // See the notes on DEFAULT_TIMEOUT. 529 if (!DB && timeoutMillis <= ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS) { 530 timeoutMillis = ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS + 1; 531 } 532 mWatchdogTimeoutMillis = timeoutMillis; 533 Slog.i(TAG, "Watchdog timeout updated to " + mWatchdogTimeoutMillis + " millis"); 534 } 535 isInterestingJavaProcess(String processName)536 private static boolean isInterestingJavaProcess(String processName) { 537 return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName) 538 || processName.equals("com.android.phone"); 539 } 540 541 /** 542 * Notifies the watchdog when a Java process with {@code pid} is started. 543 * This process may have its stack trace dumped during an ANR. 544 */ processStarted(String processName, int pid)545 public void processStarted(String processName, int pid) { 546 if (isInterestingJavaProcess(processName)) { 547 Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid); 548 synchronized (mLock) { 549 mInterestingJavaPids.add(pid); 550 } 551 } 552 } 553 554 /** 555 * Notifies the watchdog when a Java process with {@code pid} dies. 556 */ processDied(String processName, int pid)557 public void processDied(String processName, int pid) { 558 if (isInterestingJavaProcess(processName)) { 559 Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid); 560 synchronized (mLock) { 561 mInterestingJavaPids.remove(Integer.valueOf(pid)); 562 } 563 } 564 } 565 setActivityController(IActivityController controller)566 public void setActivityController(IActivityController controller) { 567 synchronized (mLock) { 568 mController = controller; 569 } 570 } 571 setAllowRestart(boolean allowRestart)572 public void setAllowRestart(boolean allowRestart) { 573 synchronized (mLock) { 574 mAllowRestart = allowRestart; 575 } 576 } 577 addMonitor(Monitor monitor)578 public void addMonitor(Monitor monitor) { 579 synchronized (mLock) { 580 mMonitorChecker.addMonitorLocked(monitor); 581 } 582 } 583 addThread(Handler thread)584 public void addThread(Handler thread) { 585 synchronized (mLock) { 586 final String name = thread.getLooper().getThread().getName(); 587 mHandlerCheckers.add(withDefaultTimeout(new HandlerChecker(thread, name))); 588 } 589 } 590 addThread(Handler thread, long timeoutMillis)591 public void addThread(Handler thread, long timeoutMillis) { 592 synchronized (mLock) { 593 final String name = thread.getLooper().getThread().getName(); 594 mHandlerCheckers.add( 595 withCustomTimeout(new HandlerChecker(thread, name), timeoutMillis)); 596 } 597 } 598 599 /** 600 * Pauses Watchdog action for the currently running thread. Useful before executing long running 601 * operations that could falsely trigger the watchdog. Each call to this will require a matching 602 * call to {@link #resumeWatchingCurrentThread}. 603 * 604 * <p>If the current thread has not been added to the Watchdog, this call is a no-op. 605 * 606 * <p>If the Watchdog is already paused for the current thread, this call adds 607 * adds another pause and will require an additional {@link #resumeCurrentThread} to resume. 608 * 609 * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all 610 * pauses have been resumed. 611 */ pauseWatchingCurrentThread(String reason)612 public void pauseWatchingCurrentThread(String reason) { 613 synchronized (mLock) { 614 for (HandlerCheckerAndTimeout hc : mHandlerCheckers) { 615 HandlerChecker checker = hc.checker(); 616 if (Thread.currentThread().equals(checker.getThread())) { 617 checker.pauseLocked(reason); 618 } 619 } 620 } 621 } 622 623 /** 624 * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running 625 * thread. 626 * 627 * <p>If the current thread has not been added to the Watchdog, this call is a no-op. 628 * 629 * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf. 630 * 631 * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise, 632 * the Watchdog action for the current thread remains paused until resume is called at least 633 * as many times as the calls to pause. 634 */ resumeWatchingCurrentThread(String reason)635 public void resumeWatchingCurrentThread(String reason) { 636 synchronized (mLock) { 637 for (HandlerCheckerAndTimeout hc : mHandlerCheckers) { 638 HandlerChecker checker = hc.checker(); 639 if (Thread.currentThread().equals(checker.getThread())) { 640 checker.resumeLocked(reason); 641 } 642 } 643 } 644 } 645 646 /** 647 * Perform a full reboot of the system. 648 */ rebootSystem(String reason)649 void rebootSystem(String reason) { 650 Slog.i(TAG, "Rebooting system because: " + reason); 651 IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE); 652 try { 653 pms.reboot(false, reason, false); 654 } catch (RemoteException ex) { 655 } 656 } 657 evaluateCheckerCompletionLocked()658 private int evaluateCheckerCompletionLocked() { 659 int state = COMPLETED; 660 for (int i=0; i<mHandlerCheckers.size(); i++) { 661 HandlerChecker hc = mHandlerCheckers.get(i).checker(); 662 state = Math.max(state, hc.getCompletionStateLocked()); 663 } 664 return state; 665 } 666 getCheckersWithStateLocked(int completionState)667 private ArrayList<HandlerChecker> getCheckersWithStateLocked(int completionState) { 668 ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>(); 669 for (int i=0; i<mHandlerCheckers.size(); i++) { 670 HandlerChecker hc = mHandlerCheckers.get(i).checker(); 671 if (hc.getCompletionStateLocked() == completionState) { 672 checkers.add(hc); 673 } 674 } 675 return checkers; 676 } 677 describeCheckersLocked(List<HandlerChecker> checkers)678 private String describeCheckersLocked(List<HandlerChecker> checkers) { 679 StringBuilder builder = new StringBuilder(128); 680 for (int i=0; i<checkers.size(); i++) { 681 if (builder.length() > 0) { 682 builder.append(", "); 683 } 684 builder.append(checkers.get(i).describeBlockedStateLocked()); 685 } 686 return builder.toString(); 687 } 688 addInterestingHidlPids(HashSet<Integer> pids)689 private static void addInterestingHidlPids(HashSet<Integer> pids) { 690 try { 691 IServiceManager serviceManager = IServiceManager.getService(); 692 ArrayList<IServiceManager.InstanceDebugInfo> dump = 693 serviceManager.debugDump(); 694 for (IServiceManager.InstanceDebugInfo info : dump) { 695 if (info.pid == IServiceManager.PidConstant.NO_PID) { 696 continue; 697 } 698 699 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) { 700 continue; 701 } 702 703 pids.add(info.pid); 704 } 705 } catch (RemoteException e) { 706 Log.w(TAG, e); 707 } 708 } 709 addInterestingAidlPids(HashSet<Integer> pids)710 private static void addInterestingAidlPids(HashSet<Integer> pids) { 711 ServiceDebugInfo[] infos = ServiceManager.getServiceDebugInfo(); 712 if (infos == null) return; 713 714 for (ServiceDebugInfo info : infos) { 715 for (String prefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) { 716 if (info.name.startsWith(prefix)) { 717 pids.add(info.debugPid); 718 } 719 } 720 } 721 } 722 getInterestingNativePids()723 static ArrayList<Integer> getInterestingNativePids() { 724 HashSet<Integer> pids = new HashSet<>(); 725 addInterestingAidlPids(pids); 726 addInterestingHidlPids(pids); 727 728 int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST); 729 if (nativePids != null) { 730 for (int i : nativePids) { 731 pids.add(i); 732 } 733 } 734 735 return new ArrayList<Integer>(pids); 736 } 737 run()738 private void run() { 739 boolean waitedHalf = false; 740 741 while (true) { 742 List<HandlerChecker> blockedCheckers = Collections.emptyList(); 743 String subject = ""; 744 boolean allowRestart = true; 745 int debuggerWasConnected = 0; 746 boolean doWaitedHalfDump = false; 747 // The value of mWatchdogTimeoutMillis might change while we are executing the loop. 748 // We store the current value to use a consistent value for all handlers. 749 final long watchdogTimeoutMillis = mWatchdogTimeoutMillis; 750 final long checkIntervalMillis = watchdogTimeoutMillis / 2; 751 final ArrayList<Integer> pids; 752 synchronized (mLock) { 753 long timeout = checkIntervalMillis; 754 // Make sure we (re)spin the checkers that have become idle within 755 // this wait-and-check interval 756 for (int i=0; i<mHandlerCheckers.size(); i++) { 757 HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i); 758 // We pick the watchdog to apply every time we reschedule the checkers. The 759 // default timeout might have changed since the last run. 760 hc.checker().scheduleCheckLocked(hc.customTimeoutMillis() 761 .orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER)); 762 } 763 764 if (debuggerWasConnected > 0) { 765 debuggerWasConnected--; 766 } 767 768 // NOTE: We use uptimeMillis() here because we do not want to increment the time we 769 // wait while asleep. If the device is asleep then the thing that we are waiting 770 // to timeout on is asleep as well and won't have a chance to run, causing a false 771 // positive on when to kill things. 772 long start = SystemClock.uptimeMillis(); 773 while (timeout > 0) { 774 if (Debug.isDebuggerConnected()) { 775 debuggerWasConnected = 2; 776 } 777 try { 778 mLock.wait(timeout); 779 // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting 780 } catch (InterruptedException e) { 781 Log.wtf(TAG, e); 782 } 783 if (Debug.isDebuggerConnected()) { 784 debuggerWasConnected = 2; 785 } 786 timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start); 787 } 788 789 final int waitState = evaluateCheckerCompletionLocked(); 790 if (waitState == COMPLETED) { 791 // The monitors have returned; reset 792 waitedHalf = false; 793 continue; 794 } else if (waitState == WAITING) { 795 // still waiting but within their configured intervals; back off and recheck 796 continue; 797 } else if (waitState == WAITED_HALF) { 798 if (!waitedHalf) { 799 Slog.i(TAG, "WAITED_HALF"); 800 waitedHalf = true; 801 // We've waited half, but we'd need to do the stack trace dump w/o the lock. 802 blockedCheckers = getCheckersWithStateLocked(WAITED_HALF); 803 subject = describeCheckersLocked(blockedCheckers); 804 pids = new ArrayList<>(mInterestingJavaPids); 805 doWaitedHalfDump = true; 806 } else { 807 continue; 808 } 809 } else { 810 // something is overdue! 811 blockedCheckers = getCheckersWithStateLocked(OVERDUE); 812 subject = describeCheckersLocked(blockedCheckers); 813 allowRestart = mAllowRestart; 814 pids = new ArrayList<>(mInterestingJavaPids); 815 } 816 } // END synchronized (mLock) 817 818 // If we got here, that means that the system is most likely hung. 819 // 820 // First collect stack traces from all threads of the system process. 821 // 822 // Then, if we reached the full timeout, kill this process so that the system will 823 // restart. If we reached half of the timeout, just log some information and continue. 824 logWatchog(doWaitedHalfDump, subject, pids); 825 826 if (doWaitedHalfDump) { 827 // We have waited for only half of the timeout, we continue to wait for the duration 828 // of the full timeout before killing the process. 829 continue; 830 } 831 832 IActivityController controller; 833 synchronized (mLock) { 834 controller = mController; 835 } 836 if (controller != null) { 837 Slog.i(TAG, "Reporting stuck state to activity controller"); 838 try { 839 Binder.setDumpDisabled("Service dumps disabled due to hung system process."); 840 // 1 = keep waiting, -1 = kill system 841 int res = controller.systemNotResponding(subject); 842 if (res >= 0) { 843 Slog.i(TAG, "Activity controller requested to coninue to wait"); 844 waitedHalf = false; 845 continue; 846 } 847 } catch (RemoteException e) { 848 } 849 } 850 851 // Only kill the process if the debugger is not attached. 852 if (Debug.isDebuggerConnected()) { 853 debuggerWasConnected = 2; 854 } 855 if (debuggerWasConnected >= 2) { 856 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process"); 857 } else if (debuggerWasConnected > 0) { 858 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process"); 859 } else if (!allowRestart) { 860 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process"); 861 } else { 862 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject); 863 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers); 864 Slog.w(TAG, "*** GOODBYE!"); 865 if (!Build.IS_USER && isCrashLoopFound() 866 && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) { 867 breakCrashLoop(); 868 } 869 Process.killProcess(Process.myPid()); 870 System.exit(10); 871 } 872 873 waitedHalf = false; 874 } 875 } 876 logWatchog(boolean halfWatchdog, String subject, ArrayList<Integer> pids)877 private void logWatchog(boolean halfWatchdog, String subject, ArrayList<Integer> pids) { 878 // Get critical event log before logging the half watchdog so that it doesn't 879 // occur in the log. 880 String criticalEvents = 881 CriticalEventLog.getInstance().logLinesForSystemServerTraceFile(); 882 final UUID errorId = mTraceErrorLogger.generateErrorId(); 883 if (mTraceErrorLogger.isAddErrorIdEnabled()) { 884 mTraceErrorLogger.addProcessInfoAndErrorIdToTrace("system_server", Process.myPid(), 885 errorId); 886 mTraceErrorLogger.addSubjectToTrace(subject, errorId); 887 } 888 889 final String dropboxTag; 890 if (halfWatchdog) { 891 dropboxTag = "pre_watchdog"; 892 CriticalEventLog.getInstance().logHalfWatchdog(subject); 893 FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_PRE_WATCHDOG_OCCURRED); 894 } else { 895 dropboxTag = "watchdog"; 896 CriticalEventLog.getInstance().logWatchdog(subject, errorId); 897 EventLog.writeEvent(EventLogTags.WATCHDOG, subject); 898 // Log the atom as early as possible since it is used as a mechanism to trigger 899 // Perfetto. Ideally, the Perfetto trace capture should happen as close to the 900 // point in time when the Watchdog happens as possible. 901 FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject); 902 } 903 904 long anrTime = SystemClock.uptimeMillis(); 905 StringBuilder report = new StringBuilder(); 906 report.append(ResourcePressureUtil.currentPsiState()); 907 ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false); 908 StringWriter tracesFileException = new StringWriter(); 909 final File stack = StackTracesDumpHelper.dumpStackTraces( 910 pids, processCpuTracker, new SparseBooleanArray(), 911 CompletableFuture.completedFuture(getInterestingNativePids()), tracesFileException, 912 subject, criticalEvents, Runnable::run, /* latencyTracker= */null); 913 // Give some extra time to make sure the stack traces get written. 914 // The system's been hanging for a whlie, another second or two won't hurt much. 915 SystemClock.sleep(5000); 916 processCpuTracker.update(); 917 report.append(processCpuTracker.printCurrentState(anrTime)); 918 report.append(tracesFileException.getBuffer()); 919 920 if (!halfWatchdog) { 921 // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the 922 // kernel log 923 doSysRq('w'); 924 doSysRq('l'); 925 } 926 927 // Try to add the error to the dropbox, but assuming that the ActivityManager 928 // itself may be deadlocked. (which has happened, causing this statement to 929 // deadlock and the watchdog as a whole to be ineffective) 930 Thread dropboxThread = new Thread("watchdogWriteToDropbox") { 931 public void run() { 932 // If a watched thread hangs before init() is called, we don't have a 933 // valid mActivity. So we can't log the error to dropbox. 934 if (mActivity != null) { 935 mActivity.addErrorToDropBox( 936 dropboxTag, null, "system_server", null, null, null, 937 null, report.toString(), stack, null, null, null, 938 errorId, null); 939 } 940 } 941 }; 942 dropboxThread.start(); 943 try { 944 dropboxThread.join(2000); // wait up to 2 seconds for it to return. 945 } catch (InterruptedException ignored) { } 946 } 947 doSysRq(char c)948 private void doSysRq(char c) { 949 try { 950 FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger"); 951 sysrq_trigger.write(c); 952 sysrq_trigger.close(); 953 } catch (IOException e) { 954 Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e); 955 } 956 } 957 resetTimeoutHistory()958 private void resetTimeoutHistory() { 959 writeTimeoutHistory(new ArrayList<String>()); 960 } 961 writeTimeoutHistory(Iterable<String> crashHistory)962 private void writeTimeoutHistory(Iterable<String> crashHistory) { 963 String data = String.join(",", crashHistory); 964 965 try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) { 966 writer.write(SystemProperties.get("ro.boottime.zygote")); 967 writer.write(":"); 968 writer.write(data); 969 } catch (IOException e) { 970 Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e); 971 } 972 } 973 readTimeoutHistory()974 private String[] readTimeoutHistory() { 975 final String[] emptyStringArray = {}; 976 977 try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) { 978 String line = reader.readLine(); 979 if (line == null) { 980 return emptyStringArray; 981 } 982 983 String[] data = line.trim().split(":"); 984 String boottime = data.length >= 1 ? data[0] : ""; 985 String history = data.length >= 2 ? data[1] : ""; 986 if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) { 987 return history.split(","); 988 } else { 989 return emptyStringArray; 990 } 991 } catch (FileNotFoundException e) { 992 return emptyStringArray; 993 } catch (IOException e) { 994 Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e); 995 return emptyStringArray; 996 } 997 } 998 hasActiveUsbConnection()999 private boolean hasActiveUsbConnection() { 1000 try { 1001 final String state = FileUtils.readTextFile( 1002 new File("/sys/class/android_usb/android0/state"), 1003 128 /*max*/, null /*ellipsis*/).trim(); 1004 if ("CONFIGURED".equals(state)) { 1005 return true; 1006 } 1007 } catch (IOException e) { 1008 Slog.w(TAG, "Failed to determine if device was on USB", e); 1009 } 1010 return false; 1011 } 1012 isCrashLoopFound()1013 private boolean isCrashLoopFound() { 1014 int fatalCount = WatchdogProperties.fatal_count().orElse(0); 1015 long fatalWindowMs = TimeUnit.SECONDS.toMillis( 1016 WatchdogProperties.fatal_window_seconds().orElse(0)); 1017 if (fatalCount == 0 || fatalWindowMs == 0) { 1018 if (fatalCount != fatalWindowMs) { 1019 Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together", 1020 PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS)); 1021 } 1022 return false; 1023 } 1024 1025 // new-history = [last (fatalCount - 1) items in old-history] + [nowMs]. 1026 long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep. 1027 String[] rawCrashHistory = readTimeoutHistory(); 1028 ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange( 1029 rawCrashHistory, 1030 Math.max(0, rawCrashHistory.length - fatalCount - 1), 1031 rawCrashHistory.length))); 1032 // Something wrong here. 1033 crashHistory.add(String.valueOf(nowMs)); 1034 writeTimeoutHistory(crashHistory); 1035 1036 // Returns false if the device has an active USB connection. 1037 if (hasActiveUsbConnection()) { 1038 return false; 1039 } 1040 1041 long firstCrashMs; 1042 try { 1043 firstCrashMs = Long.parseLong(crashHistory.get(0)); 1044 } catch (NumberFormatException t) { 1045 Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t); 1046 resetTimeoutHistory(); 1047 return false; 1048 } 1049 return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs; 1050 } 1051 breakCrashLoop()1052 private void breakCrashLoop() { 1053 try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) { 1054 kmsg.append("Fatal reset to escape the system_server crashing loop\n"); 1055 } catch (IOException e) { 1056 Slog.w(TAG, "Failed to append to kmsg", e); 1057 } 1058 doSysRq('c'); 1059 } 1060 1061 @Override dump(@onNull PrintWriter pw, @Nullable String[] args)1062 public void dump(@NonNull PrintWriter pw, @Nullable String[] args) { 1063 pw.print("WatchdogTimeoutMillis="); 1064 pw.println(mWatchdogTimeoutMillis); 1065 } 1066 } 1067