1 /*
2  * Copyright (C) 2008 The Android Open Source Project
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *      http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 package com.android.server;
18 
19 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withCustomTimeout;
20 import static com.android.server.Watchdog.HandlerCheckerAndTimeout.withDefaultTimeout;
21 
22 import android.annotation.NonNull;
23 import android.annotation.Nullable;
24 import android.app.IActivityController;
25 import android.content.BroadcastReceiver;
26 import android.content.Context;
27 import android.content.Intent;
28 import android.content.IntentFilter;
29 import android.database.ContentObserver;
30 import android.hidl.manager.V1_0.IServiceManager;
31 import android.net.Uri;
32 import android.os.Binder;
33 import android.os.Build;
34 import android.os.Debug;
35 import android.os.FileUtils;
36 import android.os.Handler;
37 import android.os.IPowerManager;
38 import android.os.Looper;
39 import android.os.Process;
40 import android.os.RemoteException;
41 import android.os.ServiceDebugInfo;
42 import android.os.ServiceManager;
43 import android.os.SystemClock;
44 import android.os.SystemProperties;
45 import android.os.UserHandle;
46 import android.provider.Settings;
47 import android.sysprop.WatchdogProperties;
48 import android.util.Dumpable;
49 import android.util.EventLog;
50 import android.util.Log;
51 import android.util.Slog;
52 import android.util.SparseBooleanArray;
53 
54 import com.android.internal.os.BackgroundThread;
55 import com.android.internal.os.ProcessCpuTracker;
56 import com.android.internal.os.ZygoteConnectionConstants;
57 import com.android.internal.util.FrameworkStatsLog;
58 import com.android.server.am.ActivityManagerService;
59 import com.android.server.am.StackTracesDumpHelper;
60 import com.android.server.am.TraceErrorLogger;
61 import com.android.server.criticalevents.CriticalEventLog;
62 import com.android.server.wm.SurfaceAnimationThread;
63 
64 import java.io.BufferedReader;
65 import java.io.File;
66 import java.io.FileNotFoundException;
67 import java.io.FileReader;
68 import java.io.FileWriter;
69 import java.io.IOException;
70 import java.io.PrintWriter;
71 import java.io.StringWriter;
72 import java.util.ArrayList;
73 import java.util.Arrays;
74 import java.util.Collections;
75 import java.util.HashSet;
76 import java.util.List;
77 import java.util.Optional;
78 import java.util.UUID;
79 import java.util.concurrent.CompletableFuture;
80 import java.util.concurrent.TimeUnit;
81 
82 /**
83  * This class calls its monitor every minute. Killing this process if they don't return
84  **/
85 public class Watchdog implements Dumpable {
86     static final String TAG = "Watchdog";
87 
88     /** Debug flag. */
89     public static final boolean DEBUG = false;
90 
91     // Set this to true to use debug default values.
92     private static final boolean DB = false;
93 
94     // Note 1: Do not lower this value below thirty seconds without tightening the invoke-with
95     //         timeout in com.android.internal.os.ZygoteConnection, or wrapped applications
96     //         can trigger the watchdog.
97     // Note 2: The debug value is already below the wait time in ZygoteConnection. Wrapped
98     //         applications may not work with a debug build. CTS will fail.
99     private static final long DEFAULT_TIMEOUT = DB ? 10 * 1000 : 60 * 1000;
100 
101     // These are temporally ordered: larger values as lateness increases
102     private static final int COMPLETED = 0;
103     private static final int WAITING = 1;
104     private static final int WAITED_HALF = 2;
105     private static final int OVERDUE = 3;
106 
107     // Track watchdog timeout history and break the crash loop if there is.
108     private static final String TIMEOUT_HISTORY_FILE = "/data/system/watchdog-timeout-history.txt";
109     private static final String PROP_FATAL_LOOP_COUNT = "framework_watchdog.fatal_count";
110     private static final String PROP_FATAL_LOOP_WINDOWS_SECS =
111             "framework_watchdog.fatal_window.second";
112 
113     // Which native processes to dump into dropbox's stack traces
114     public static final String[] NATIVE_STACKS_OF_INTEREST = new String[] {
115         "/system/bin/audioserver",
116         "/system/bin/cameraserver",
117         "/system/bin/drmserver",
118         "/system/bin/keystore2",
119         "/system/bin/mediadrmserver",
120         "/system/bin/mediaserver",
121         "/system/bin/netd",
122         "/system/bin/sdcard",
123         "/system/bin/surfaceflinger",
124         "/system/bin/vold",
125         "media.extractor", // system/bin/mediaextractor
126         "media.metrics", // system/bin/mediametrics
127         "media.codec", // vendor/bin/hw/android.hardware.media.omx@1.0-service
128         "media.swcodec", // /apex/com.android.media.swcodec/bin/mediaswcodec
129         "media.transcoding", // Media transcoding service
130         "com.android.bluetooth",  // Bluetooth service
131         "/apex/com.android.os.statsd/bin/statsd",  // Stats daemon
132     };
133 
134     public static final List<String> HAL_INTERFACES_OF_INTEREST = Arrays.asList(
135             "android.hardware.audio@4.0::IDevicesFactory",
136             "android.hardware.audio@5.0::IDevicesFactory",
137             "android.hardware.audio@6.0::IDevicesFactory",
138             "android.hardware.audio@7.0::IDevicesFactory",
139             "android.hardware.biometrics.face@1.0::IBiometricsFace",
140             "android.hardware.biometrics.fingerprint@2.1::IBiometricsFingerprint",
141             "android.hardware.bluetooth@1.0::IBluetoothHci",
142             "android.hardware.camera.provider@2.4::ICameraProvider",
143             "android.hardware.gnss@1.0::IGnss",
144             "android.hardware.graphics.allocator@2.0::IAllocator",
145             "android.hardware.graphics.composer@2.1::IComposer",
146             "android.hardware.health@2.0::IHealth",
147             "android.hardware.light@2.0::ILight",
148             "android.hardware.media.c2@1.0::IComponentStore",
149             "android.hardware.media.omx@1.0::IOmx",
150             "android.hardware.media.omx@1.0::IOmxStore",
151             "android.hardware.neuralnetworks@1.0::IDevice",
152             "android.hardware.power@1.0::IPower",
153             "android.hardware.power.stats@1.0::IPowerStats",
154             "android.hardware.sensors@1.0::ISensors",
155             "android.hardware.sensors@2.0::ISensors",
156             "android.hardware.sensors@2.1::ISensors",
157             "android.hardware.vibrator@1.0::IVibrator",
158             "android.hardware.vr@1.0::IVr",
159             "android.system.suspend@1.0::ISystemSuspend"
160     );
161 
162     public static final String[] AIDL_INTERFACE_PREFIXES_OF_INTEREST = new String[] {
163             "android.hardware.audio.core.IModule/",
164             "android.hardware.audio.core.IConfig/",
165             "android.hardware.biometrics.face.IFace/",
166             "android.hardware.biometrics.fingerprint.IFingerprint/",
167             "android.hardware.bluetooth.IBluetoothHci/",
168             "android.hardware.camera.provider.ICameraProvider/",
169             "android.hardware.gnss.IGnss/",
170             "android.hardware.graphics.allocator.IAllocator/",
171             "android.hardware.graphics.composer3.IComposer/",
172             "android.hardware.health.IHealth/",
173             "android.hardware.input.processor.IInputProcessor/",
174             "android.hardware.light.ILights/",
175             "android.hardware.neuralnetworks.IDevice/",
176             "android.hardware.power.IPower/",
177             "android.hardware.power.stats.IPowerStats/",
178             "android.hardware.sensors.ISensors/",
179             "android.hardware.vibrator.IVibrator/",
180             "android.hardware.vibrator.IVibratorManager/",
181             "android.system.suspend.ISystemSuspend/",
182     };
183 
184     private static Watchdog sWatchdog;
185 
186     private final Thread mThread;
187 
188     private final Object mLock = new Object();
189 
190     /* This handler will be used to post message back onto the main thread */
191     private final ArrayList<HandlerCheckerAndTimeout> mHandlerCheckers = new ArrayList<>();
192     private final HandlerChecker mMonitorChecker;
193     private ActivityManagerService mActivity;
194     private IActivityController mController;
195     private boolean mAllowRestart = true;
196     // We start with DEFAULT_TIMEOUT. This will then be update with the timeout values from Settings
197     // once the settings provider is initialized.
198     private volatile long mWatchdogTimeoutMillis = DEFAULT_TIMEOUT;
199     private final List<Integer> mInterestingJavaPids = new ArrayList<>();
200     private final TraceErrorLogger mTraceErrorLogger;
201 
202     /** Holds a checker and its timeout. */
203     static final class HandlerCheckerAndTimeout {
204         private final HandlerChecker mHandler;
205         private final Optional<Long> mCustomTimeoutMillis;
206 
HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis)207         private HandlerCheckerAndTimeout(HandlerChecker checker, Optional<Long> timeoutMillis) {
208             this.mHandler = checker;
209             this.mCustomTimeoutMillis = timeoutMillis;
210         }
211 
checker()212         HandlerChecker checker() {
213             return mHandler;
214         }
215 
216         /** Returns the timeout. */
customTimeoutMillis()217         Optional<Long> customTimeoutMillis() {
218             return mCustomTimeoutMillis;
219         }
220 
221         /**
222          * Creates a checker with the default timeout. The timeout will use the default value which
223          * is configurable server-side.
224          */
withDefaultTimeout(HandlerChecker checker)225         static HandlerCheckerAndTimeout withDefaultTimeout(HandlerChecker checker) {
226             return new HandlerCheckerAndTimeout(checker, Optional.empty());
227         }
228 
229         /**
230          * Creates a checker with a custom timeout. The timeout overrides the default value and will
231          * always be used.
232          */
withCustomTimeout( HandlerChecker checker, long timeoutMillis)233         static HandlerCheckerAndTimeout withCustomTimeout(
234                 HandlerChecker checker, long timeoutMillis) {
235             return new HandlerCheckerAndTimeout(checker, Optional.of(timeoutMillis));
236         }
237     }
238 
239     /**
240      * Used for checking status of handle threads and scheduling monitor callbacks.
241      */
242     public final class HandlerChecker implements Runnable {
243         private final Handler mHandler;
244         private final String mName;
245         private final ArrayList<Monitor> mMonitors = new ArrayList<Monitor>();
246         private final ArrayList<Monitor> mMonitorQueue = new ArrayList<Monitor>();
247         private long mWaitMaxMillis;
248         private boolean mCompleted;
249         private Monitor mCurrentMonitor;
250         private long mStartTimeMillis;
251         private int mPauseCount;
252 
HandlerChecker(Handler handler, String name)253         HandlerChecker(Handler handler, String name) {
254             mHandler = handler;
255             mName = name;
256             mCompleted = true;
257         }
258 
addMonitorLocked(Monitor monitor)259         void addMonitorLocked(Monitor monitor) {
260             // We don't want to update mMonitors when the Handler is in the middle of checking
261             // all monitors. We will update mMonitors on the next schedule if it is safe
262             mMonitorQueue.add(monitor);
263         }
264 
265         /**
266          * Schedules a run on the handler thread.
267          *
268          * @param handlerCheckerTimeoutMillis the timeout to use for this run
269          */
scheduleCheckLocked(long handlerCheckerTimeoutMillis)270         public void scheduleCheckLocked(long handlerCheckerTimeoutMillis) {
271             mWaitMaxMillis = handlerCheckerTimeoutMillis;
272             if (mCompleted) {
273                 // Safe to update monitors in queue, Handler is not in the middle of work
274                 mMonitors.addAll(mMonitorQueue);
275                 mMonitorQueue.clear();
276             }
277             if ((mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling())
278                     || (mPauseCount > 0)) {
279                 // Don't schedule until after resume OR
280                 // If the target looper has recently been polling, then
281                 // there is no reason to enqueue our checker on it since that
282                 // is as good as it not being deadlocked.  This avoid having
283                 // to do a context switch to check the thread. Note that we
284                 // only do this if we have no monitors since those would need to
285                 // be executed at this point.
286                 mCompleted = true;
287                 return;
288             }
289             if (!mCompleted) {
290                 // we already have a check in flight, so no need
291                 return;
292             }
293 
294             mCompleted = false;
295             mCurrentMonitor = null;
296             mStartTimeMillis = SystemClock.uptimeMillis();
297             mHandler.postAtFrontOfQueue(this);
298         }
299 
getCompletionStateLocked()300         public int getCompletionStateLocked() {
301             if (mCompleted) {
302                 return COMPLETED;
303             } else {
304                 long latency = SystemClock.uptimeMillis() - mStartTimeMillis;
305                 if (latency < mWaitMaxMillis / 2) {
306                     return WAITING;
307                 } else if (latency < mWaitMaxMillis) {
308                     return WAITED_HALF;
309                 }
310             }
311             return OVERDUE;
312         }
313 
getThread()314         public Thread getThread() {
315             return mHandler.getLooper().getThread();
316         }
317 
getName()318         public String getName() {
319             return mName;
320         }
321 
describeBlockedStateLocked()322         String describeBlockedStateLocked() {
323             final String prefix;
324             if (mCurrentMonitor == null) {
325                 prefix = "Blocked in handler on ";
326             } else {
327                 prefix =  "Blocked in monitor " + mCurrentMonitor.getClass().getName();
328             }
329             long latencySeconds = (SystemClock.uptimeMillis() - mStartTimeMillis) / 1000;
330             return prefix + " on " + mName + " (" + getThread().getName() + ")"
331                 + " for " + latencySeconds + "s";
332         }
333 
334         @Override
run()335         public void run() {
336             // Once we get here, we ensure that mMonitors does not change even if we call
337             // #addMonitorLocked because we first add the new monitors to mMonitorQueue and
338             // move them to mMonitors on the next schedule when mCompleted is true, at which
339             // point we have completed execution of this method.
340             final int size = mMonitors.size();
341             for (int i = 0 ; i < size ; i++) {
342                 synchronized (mLock) {
343                     mCurrentMonitor = mMonitors.get(i);
344                 }
345                 mCurrentMonitor.monitor();
346             }
347 
348             synchronized (mLock) {
349                 mCompleted = true;
350                 mCurrentMonitor = null;
351             }
352         }
353 
354         /** Pause the HandlerChecker. */
pauseLocked(String reason)355         public void pauseLocked(String reason) {
356             mPauseCount++;
357             // Mark as completed, because there's a chance we called this after the watchog
358             // thread loop called Object#wait after 'WAITED_HALF'. In that case we want to ensure
359             // the next call to #getCompletionStateLocked for this checker returns 'COMPLETED'
360             mCompleted = true;
361             Slog.i(TAG, "Pausing HandlerChecker: " + mName + " for reason: "
362                     + reason + ". Pause count: " + mPauseCount);
363         }
364 
365         /** Resume the HandlerChecker from the last {@link #pauseLocked}. */
resumeLocked(String reason)366         public void resumeLocked(String reason) {
367             if (mPauseCount > 0) {
368                 mPauseCount--;
369                 Slog.i(TAG, "Resuming HandlerChecker: " + mName + " for reason: "
370                         + reason + ". Pause count: " + mPauseCount);
371             } else {
372                 Slog.wtf(TAG, "Already resumed HandlerChecker: " + mName);
373             }
374         }
375     }
376 
377     final class RebootRequestReceiver extends BroadcastReceiver {
378         @Override
onReceive(Context c, Intent intent)379         public void onReceive(Context c, Intent intent) {
380             if (intent.getIntExtra("nowait", 0) != 0) {
381                 rebootSystem("Received ACTION_REBOOT broadcast");
382                 return;
383             }
384             Slog.w(TAG, "Unsupported ACTION_REBOOT broadcast: " + intent);
385         }
386     }
387 
388     /** Monitor for checking the availability of binder threads. The monitor will block until
389      * there is a binder thread available to process in coming IPCs to make sure other processes
390      * can still communicate with the service.
391      */
392     private static final class BinderThreadMonitor implements Watchdog.Monitor {
393         @Override
monitor()394         public void monitor() {
395             Binder.blockUntilThreadAvailable();
396         }
397     }
398 
399     public interface Monitor {
monitor()400         void monitor();
401     }
402 
getInstance()403     public static Watchdog getInstance() {
404         if (sWatchdog == null) {
405             sWatchdog = new Watchdog();
406         }
407 
408         return sWatchdog;
409     }
410 
Watchdog()411     private Watchdog() {
412         mThread = new Thread(this::run, "watchdog");
413 
414         // Initialize handler checkers for each common thread we want to check.  Note
415         // that we are not currently checking the background thread, since it can
416         // potentially hold longer running operations with no guarantees about the timeliness
417         // of operations there.
418         //
419         // Use a custom thread to check monitors to avoid lock contention from impacted other
420         // threads.
421         ServiceThread t = new ServiceThread("watchdog.monitor",
422                 android.os.Process.THREAD_PRIORITY_DEFAULT, true /*allowIo*/);
423         t.start();
424         mMonitorChecker = new HandlerChecker(new Handler(t.getLooper()), "monitor thread");
425         mHandlerCheckers.add(withDefaultTimeout(mMonitorChecker));
426 
427         mHandlerCheckers.add(withDefaultTimeout(
428                 new HandlerChecker(FgThread.getHandler(), "foreground thread")));
429         // Add checker for main thread.  We only do a quick check since there
430         // can be UI running on the thread.
431         mHandlerCheckers.add(withDefaultTimeout(
432                 new HandlerChecker(new Handler(Looper.getMainLooper()), "main thread")));
433         // Add checker for shared UI thread.
434         mHandlerCheckers.add(withDefaultTimeout(
435                 new HandlerChecker(UiThread.getHandler(), "ui thread")));
436         // And also check IO thread.
437         mHandlerCheckers.add(withDefaultTimeout(
438                 new HandlerChecker(IoThread.getHandler(), "i/o thread")));
439         // And the display thread.
440         mHandlerCheckers.add(withDefaultTimeout(
441                 new HandlerChecker(DisplayThread.getHandler(), "display thread")));
442         // And the animation thread.
443         mHandlerCheckers.add(withDefaultTimeout(
444                  new HandlerChecker(AnimationThread.getHandler(), "animation thread")));
445         // And the surface animation thread.
446         mHandlerCheckers.add(withDefaultTimeout(
447                 new HandlerChecker(SurfaceAnimationThread.getHandler(),
448                     "surface animation thread")));
449         // Initialize monitor for Binder threads.
450         addMonitor(new BinderThreadMonitor());
451 
452         mInterestingJavaPids.add(Process.myPid());
453 
454         // See the notes on DEFAULT_TIMEOUT.
455         assert DB ||
456                 DEFAULT_TIMEOUT > ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS;
457 
458         mTraceErrorLogger = new TraceErrorLogger();
459     }
460 
461     /**
462      * Called by SystemServer to cause the internal thread to begin execution.
463      */
start()464     public void start() {
465         mThread.start();
466     }
467 
468     /**
469      * Registers a {@link BroadcastReceiver} to listen to reboot broadcasts and trigger reboot.
470      * Should be called during boot after the ActivityManagerService is up and registered
471      * as a system service so it can handle registration of a {@link BroadcastReceiver}.
472      */
init(Context context, ActivityManagerService activity)473     public void init(Context context, ActivityManagerService activity) {
474         mActivity = activity;
475         context.registerReceiver(new RebootRequestReceiver(),
476                 new IntentFilter(Intent.ACTION_REBOOT),
477                 android.Manifest.permission.REBOOT, null);
478     }
479 
480     private static class SettingsObserver extends ContentObserver {
481         private final Uri mUri = Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS);
482         private final Context mContext;
483         private final Watchdog mWatchdog;
484 
SettingsObserver(Context context, Watchdog watchdog)485         SettingsObserver(Context context, Watchdog watchdog) {
486             super(BackgroundThread.getHandler());
487             mContext = context;
488             mWatchdog = watchdog;
489             // Always kick once to ensure that we match current state
490             onChange();
491         }
492 
493         @Override
onChange(boolean selfChange, Uri uri, int userId)494         public void onChange(boolean selfChange, Uri uri, int userId) {
495             if (mUri.equals(uri)) {
496                 onChange();
497             }
498         }
499 
onChange()500         public void onChange() {
501             try {
502                 mWatchdog.updateWatchdogTimeout(Settings.Global.getLong(
503                         mContext.getContentResolver(),
504                         Settings.Global.WATCHDOG_TIMEOUT_MILLIS, DEFAULT_TIMEOUT));
505             } catch (RuntimeException e) {
506                 Slog.e(TAG, "Exception while reading settings " + e.getMessage(), e);
507             }
508         }
509     }
510 
511     /**
512      * Register an observer to listen to settings.
513      *
514      * It needs to be called after the settings service is initialized.
515      */
registerSettingsObserver(Context context)516     public void registerSettingsObserver(Context context) {
517         context.getContentResolver().registerContentObserver(
518                 Settings.Global.getUriFor(Settings.Global.WATCHDOG_TIMEOUT_MILLIS),
519                 false,
520                 new SettingsObserver(context, this),
521                 UserHandle.USER_SYSTEM);
522     }
523 
524     /**
525      * Updates watchdog timeout values.
526      */
updateWatchdogTimeout(long timeoutMillis)527     void updateWatchdogTimeout(long timeoutMillis) {
528         // See the notes on DEFAULT_TIMEOUT.
529         if (!DB && timeoutMillis <= ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS) {
530             timeoutMillis = ZygoteConnectionConstants.WRAPPED_PID_TIMEOUT_MILLIS + 1;
531         }
532         mWatchdogTimeoutMillis = timeoutMillis;
533         Slog.i(TAG, "Watchdog timeout updated to " + mWatchdogTimeoutMillis + " millis");
534     }
535 
isInterestingJavaProcess(String processName)536     private static boolean isInterestingJavaProcess(String processName) {
537         return processName.equals(StorageManagerService.sMediaStoreAuthorityProcessName)
538                 || processName.equals("com.android.phone");
539     }
540 
541     /**
542      * Notifies the watchdog when a Java process with {@code pid} is started.
543      * This process may have its stack trace dumped during an ANR.
544      */
processStarted(String processName, int pid)545     public void processStarted(String processName, int pid) {
546         if (isInterestingJavaProcess(processName)) {
547             Slog.i(TAG, "Interesting Java process " + processName + " started. Pid " + pid);
548             synchronized (mLock) {
549                 mInterestingJavaPids.add(pid);
550             }
551         }
552     }
553 
554     /**
555      * Notifies the watchdog when a Java process with {@code pid} dies.
556      */
processDied(String processName, int pid)557     public void processDied(String processName, int pid) {
558         if (isInterestingJavaProcess(processName)) {
559             Slog.i(TAG, "Interesting Java process " + processName + " died. Pid " + pid);
560             synchronized (mLock) {
561                 mInterestingJavaPids.remove(Integer.valueOf(pid));
562             }
563         }
564     }
565 
setActivityController(IActivityController controller)566     public void setActivityController(IActivityController controller) {
567         synchronized (mLock) {
568             mController = controller;
569         }
570     }
571 
setAllowRestart(boolean allowRestart)572     public void setAllowRestart(boolean allowRestart) {
573         synchronized (mLock) {
574             mAllowRestart = allowRestart;
575         }
576     }
577 
addMonitor(Monitor monitor)578     public void addMonitor(Monitor monitor) {
579         synchronized (mLock) {
580             mMonitorChecker.addMonitorLocked(monitor);
581         }
582     }
583 
addThread(Handler thread)584     public void addThread(Handler thread) {
585         synchronized (mLock) {
586             final String name = thread.getLooper().getThread().getName();
587             mHandlerCheckers.add(withDefaultTimeout(new HandlerChecker(thread, name)));
588         }
589     }
590 
addThread(Handler thread, long timeoutMillis)591     public void addThread(Handler thread, long timeoutMillis) {
592         synchronized (mLock) {
593             final String name = thread.getLooper().getThread().getName();
594             mHandlerCheckers.add(
595                     withCustomTimeout(new HandlerChecker(thread, name), timeoutMillis));
596         }
597     }
598 
599     /**
600      * Pauses Watchdog action for the currently running thread. Useful before executing long running
601      * operations that could falsely trigger the watchdog. Each call to this will require a matching
602      * call to {@link #resumeWatchingCurrentThread}.
603      *
604      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
605      *
606      * <p>If the Watchdog is already paused for the current thread, this call adds
607      * adds another pause and will require an additional {@link #resumeCurrentThread} to resume.
608      *
609      * <p>Note: Use with care, as any deadlocks on the current thread will be undetected until all
610      * pauses have been resumed.
611      */
pauseWatchingCurrentThread(String reason)612     public void pauseWatchingCurrentThread(String reason) {
613         synchronized (mLock) {
614             for (HandlerCheckerAndTimeout hc : mHandlerCheckers) {
615                 HandlerChecker checker = hc.checker();
616                 if (Thread.currentThread().equals(checker.getThread())) {
617                     checker.pauseLocked(reason);
618                 }
619             }
620         }
621     }
622 
623     /**
624      * Resumes the last pause from {@link #pauseWatchingCurrentThread} for the currently running
625      * thread.
626      *
627      * <p>If the current thread has not been added to the Watchdog, this call is a no-op.
628      *
629      * <p>If the Watchdog action for the current thread is already resumed, this call logs a wtf.
630      *
631      * <p>If all pauses have been resumed, the Watchdog action is finally resumed, otherwise,
632      * the Watchdog action for the current thread remains paused until resume is called at least
633      * as many times as the calls to pause.
634      */
resumeWatchingCurrentThread(String reason)635     public void resumeWatchingCurrentThread(String reason) {
636         synchronized (mLock) {
637             for (HandlerCheckerAndTimeout hc : mHandlerCheckers) {
638                 HandlerChecker checker = hc.checker();
639                 if (Thread.currentThread().equals(checker.getThread())) {
640                     checker.resumeLocked(reason);
641                 }
642             }
643         }
644     }
645 
646     /**
647      * Perform a full reboot of the system.
648      */
rebootSystem(String reason)649     void rebootSystem(String reason) {
650         Slog.i(TAG, "Rebooting system because: " + reason);
651         IPowerManager pms = (IPowerManager)ServiceManager.getService(Context.POWER_SERVICE);
652         try {
653             pms.reboot(false, reason, false);
654         } catch (RemoteException ex) {
655         }
656     }
657 
evaluateCheckerCompletionLocked()658     private int evaluateCheckerCompletionLocked() {
659         int state = COMPLETED;
660         for (int i=0; i<mHandlerCheckers.size(); i++) {
661             HandlerChecker hc = mHandlerCheckers.get(i).checker();
662             state = Math.max(state, hc.getCompletionStateLocked());
663         }
664         return state;
665     }
666 
getCheckersWithStateLocked(int completionState)667     private ArrayList<HandlerChecker> getCheckersWithStateLocked(int completionState) {
668         ArrayList<HandlerChecker> checkers = new ArrayList<HandlerChecker>();
669         for (int i=0; i<mHandlerCheckers.size(); i++) {
670             HandlerChecker hc = mHandlerCheckers.get(i).checker();
671             if (hc.getCompletionStateLocked() == completionState) {
672                 checkers.add(hc);
673             }
674         }
675         return checkers;
676     }
677 
describeCheckersLocked(List<HandlerChecker> checkers)678     private String describeCheckersLocked(List<HandlerChecker> checkers) {
679         StringBuilder builder = new StringBuilder(128);
680         for (int i=0; i<checkers.size(); i++) {
681             if (builder.length() > 0) {
682                 builder.append(", ");
683             }
684             builder.append(checkers.get(i).describeBlockedStateLocked());
685         }
686         return builder.toString();
687     }
688 
addInterestingHidlPids(HashSet<Integer> pids)689     private static void addInterestingHidlPids(HashSet<Integer> pids) {
690         try {
691             IServiceManager serviceManager = IServiceManager.getService();
692             ArrayList<IServiceManager.InstanceDebugInfo> dump =
693                     serviceManager.debugDump();
694             for (IServiceManager.InstanceDebugInfo info : dump) {
695                 if (info.pid == IServiceManager.PidConstant.NO_PID) {
696                     continue;
697                 }
698 
699                 if (!HAL_INTERFACES_OF_INTEREST.contains(info.interfaceName)) {
700                     continue;
701                 }
702 
703                 pids.add(info.pid);
704             }
705         } catch (RemoteException e) {
706             Log.w(TAG, e);
707         }
708     }
709 
addInterestingAidlPids(HashSet<Integer> pids)710     private static void addInterestingAidlPids(HashSet<Integer> pids) {
711         ServiceDebugInfo[] infos = ServiceManager.getServiceDebugInfo();
712         if (infos == null) return;
713 
714         for (ServiceDebugInfo info : infos) {
715             for (String prefix : AIDL_INTERFACE_PREFIXES_OF_INTEREST) {
716                 if (info.name.startsWith(prefix)) {
717                     pids.add(info.debugPid);
718                 }
719             }
720         }
721     }
722 
getInterestingNativePids()723     static ArrayList<Integer> getInterestingNativePids() {
724         HashSet<Integer> pids = new HashSet<>();
725         addInterestingAidlPids(pids);
726         addInterestingHidlPids(pids);
727 
728         int[] nativePids = Process.getPidsForCommands(NATIVE_STACKS_OF_INTEREST);
729         if (nativePids != null) {
730             for (int i : nativePids) {
731                 pids.add(i);
732             }
733         }
734 
735         return new ArrayList<Integer>(pids);
736     }
737 
run()738     private void run() {
739         boolean waitedHalf = false;
740 
741         while (true) {
742             List<HandlerChecker> blockedCheckers = Collections.emptyList();
743             String subject = "";
744             boolean allowRestart = true;
745             int debuggerWasConnected = 0;
746             boolean doWaitedHalfDump = false;
747             // The value of mWatchdogTimeoutMillis might change while we are executing the loop.
748             // We store the current value to use a consistent value for all handlers.
749             final long watchdogTimeoutMillis = mWatchdogTimeoutMillis;
750             final long checkIntervalMillis = watchdogTimeoutMillis / 2;
751             final ArrayList<Integer> pids;
752             synchronized (mLock) {
753                 long timeout = checkIntervalMillis;
754                 // Make sure we (re)spin the checkers that have become idle within
755                 // this wait-and-check interval
756                 for (int i=0; i<mHandlerCheckers.size(); i++) {
757                     HandlerCheckerAndTimeout hc = mHandlerCheckers.get(i);
758                     // We pick the watchdog to apply every time we reschedule the checkers. The
759                     // default timeout might have changed since the last run.
760                     hc.checker().scheduleCheckLocked(hc.customTimeoutMillis()
761                             .orElse(watchdogTimeoutMillis * Build.HW_TIMEOUT_MULTIPLIER));
762                 }
763 
764                 if (debuggerWasConnected > 0) {
765                     debuggerWasConnected--;
766                 }
767 
768                 // NOTE: We use uptimeMillis() here because we do not want to increment the time we
769                 // wait while asleep. If the device is asleep then the thing that we are waiting
770                 // to timeout on is asleep as well and won't have a chance to run, causing a false
771                 // positive on when to kill things.
772                 long start = SystemClock.uptimeMillis();
773                 while (timeout > 0) {
774                     if (Debug.isDebuggerConnected()) {
775                         debuggerWasConnected = 2;
776                     }
777                     try {
778                         mLock.wait(timeout);
779                         // Note: mHandlerCheckers and mMonitorChecker may have changed after waiting
780                     } catch (InterruptedException e) {
781                         Log.wtf(TAG, e);
782                     }
783                     if (Debug.isDebuggerConnected()) {
784                         debuggerWasConnected = 2;
785                     }
786                     timeout = checkIntervalMillis - (SystemClock.uptimeMillis() - start);
787                 }
788 
789                 final int waitState = evaluateCheckerCompletionLocked();
790                 if (waitState == COMPLETED) {
791                     // The monitors have returned; reset
792                     waitedHalf = false;
793                     continue;
794                 } else if (waitState == WAITING) {
795                     // still waiting but within their configured intervals; back off and recheck
796                     continue;
797                 } else if (waitState == WAITED_HALF) {
798                     if (!waitedHalf) {
799                         Slog.i(TAG, "WAITED_HALF");
800                         waitedHalf = true;
801                         // We've waited half, but we'd need to do the stack trace dump w/o the lock.
802                         blockedCheckers = getCheckersWithStateLocked(WAITED_HALF);
803                         subject = describeCheckersLocked(blockedCheckers);
804                         pids = new ArrayList<>(mInterestingJavaPids);
805                         doWaitedHalfDump = true;
806                     } else {
807                         continue;
808                     }
809                 } else {
810                     // something is overdue!
811                     blockedCheckers = getCheckersWithStateLocked(OVERDUE);
812                     subject = describeCheckersLocked(blockedCheckers);
813                     allowRestart = mAllowRestart;
814                     pids = new ArrayList<>(mInterestingJavaPids);
815                 }
816             } // END synchronized (mLock)
817 
818             // If we got here, that means that the system is most likely hung.
819             //
820             // First collect stack traces from all threads of the system process.
821             //
822             // Then, if we reached the full timeout, kill this process so that the system will
823             // restart. If we reached half of the timeout, just log some information and continue.
824             logWatchog(doWaitedHalfDump, subject, pids);
825 
826             if (doWaitedHalfDump) {
827                 // We have waited for only half of the timeout, we continue to wait for the duration
828                 // of the full timeout before killing the process.
829                 continue;
830             }
831 
832             IActivityController controller;
833             synchronized (mLock) {
834                 controller = mController;
835             }
836             if (controller != null) {
837                 Slog.i(TAG, "Reporting stuck state to activity controller");
838                 try {
839                     Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
840                     // 1 = keep waiting, -1 = kill system
841                     int res = controller.systemNotResponding(subject);
842                     if (res >= 0) {
843                         Slog.i(TAG, "Activity controller requested to coninue to wait");
844                         waitedHalf = false;
845                         continue;
846                     }
847                 } catch (RemoteException e) {
848                 }
849             }
850 
851             // Only kill the process if the debugger is not attached.
852             if (Debug.isDebuggerConnected()) {
853                 debuggerWasConnected = 2;
854             }
855             if (debuggerWasConnected >= 2) {
856                 Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
857             } else if (debuggerWasConnected > 0) {
858                 Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
859             } else if (!allowRestart) {
860                 Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
861             } else {
862                 Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
863                 WatchdogDiagnostics.diagnoseCheckers(blockedCheckers);
864                 Slog.w(TAG, "*** GOODBYE!");
865                 if (!Build.IS_USER && isCrashLoopFound()
866                         && !WatchdogProperties.should_ignore_fatal_count().orElse(false)) {
867                     breakCrashLoop();
868                 }
869                 Process.killProcess(Process.myPid());
870                 System.exit(10);
871             }
872 
873             waitedHalf = false;
874         }
875     }
876 
logWatchog(boolean halfWatchdog, String subject, ArrayList<Integer> pids)877     private void logWatchog(boolean halfWatchdog, String subject, ArrayList<Integer> pids) {
878         // Get critical event log before logging the half watchdog so that it doesn't
879         // occur in the log.
880         String criticalEvents =
881                 CriticalEventLog.getInstance().logLinesForSystemServerTraceFile();
882         final UUID errorId = mTraceErrorLogger.generateErrorId();
883         if (mTraceErrorLogger.isAddErrorIdEnabled()) {
884             mTraceErrorLogger.addProcessInfoAndErrorIdToTrace("system_server", Process.myPid(),
885                     errorId);
886             mTraceErrorLogger.addSubjectToTrace(subject, errorId);
887         }
888 
889         final String dropboxTag;
890         if (halfWatchdog) {
891             dropboxTag = "pre_watchdog";
892             CriticalEventLog.getInstance().logHalfWatchdog(subject);
893             FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_PRE_WATCHDOG_OCCURRED);
894         } else {
895             dropboxTag = "watchdog";
896             CriticalEventLog.getInstance().logWatchdog(subject, errorId);
897             EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
898             // Log the atom as early as possible since it is used as a mechanism to trigger
899             // Perfetto. Ideally, the Perfetto trace capture should happen as close to the
900             // point in time when the Watchdog happens as possible.
901             FrameworkStatsLog.write(FrameworkStatsLog.SYSTEM_SERVER_WATCHDOG_OCCURRED, subject);
902         }
903 
904         long anrTime = SystemClock.uptimeMillis();
905         StringBuilder report = new StringBuilder();
906         report.append(ResourcePressureUtil.currentPsiState());
907         ProcessCpuTracker processCpuTracker = new ProcessCpuTracker(false);
908         StringWriter tracesFileException = new StringWriter();
909         final File stack = StackTracesDumpHelper.dumpStackTraces(
910                 pids, processCpuTracker, new SparseBooleanArray(),
911                 CompletableFuture.completedFuture(getInterestingNativePids()), tracesFileException,
912                 subject, criticalEvents, Runnable::run, /* latencyTracker= */null);
913         // Give some extra time to make sure the stack traces get written.
914         // The system's been hanging for a whlie, another second or two won't hurt much.
915         SystemClock.sleep(5000);
916         processCpuTracker.update();
917         report.append(processCpuTracker.printCurrentState(anrTime));
918         report.append(tracesFileException.getBuffer());
919 
920         if (!halfWatchdog) {
921             // Trigger the kernel to dump all blocked threads, and backtraces on all CPUs to the
922             // kernel log
923             doSysRq('w');
924             doSysRq('l');
925         }
926 
927         // Try to add the error to the dropbox, but assuming that the ActivityManager
928         // itself may be deadlocked.  (which has happened, causing this statement to
929         // deadlock and the watchdog as a whole to be ineffective)
930         Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
931                 public void run() {
932                     // If a watched thread hangs before init() is called, we don't have a
933                     // valid mActivity. So we can't log the error to dropbox.
934                     if (mActivity != null) {
935                         mActivity.addErrorToDropBox(
936                                 dropboxTag, null, "system_server", null, null, null,
937                                 null, report.toString(), stack, null, null, null,
938                                 errorId, null);
939                     }
940                 }
941             };
942         dropboxThread.start();
943         try {
944             dropboxThread.join(2000);  // wait up to 2 seconds for it to return.
945         } catch (InterruptedException ignored) { }
946     }
947 
doSysRq(char c)948     private void doSysRq(char c) {
949         try {
950             FileWriter sysrq_trigger = new FileWriter("/proc/sysrq-trigger");
951             sysrq_trigger.write(c);
952             sysrq_trigger.close();
953         } catch (IOException e) {
954             Slog.w(TAG, "Failed to write to /proc/sysrq-trigger", e);
955         }
956     }
957 
resetTimeoutHistory()958     private void resetTimeoutHistory() {
959         writeTimeoutHistory(new ArrayList<String>());
960     }
961 
writeTimeoutHistory(Iterable<String> crashHistory)962     private void writeTimeoutHistory(Iterable<String> crashHistory) {
963         String data = String.join(",", crashHistory);
964 
965         try (FileWriter writer = new FileWriter(TIMEOUT_HISTORY_FILE)) {
966             writer.write(SystemProperties.get("ro.boottime.zygote"));
967             writer.write(":");
968             writer.write(data);
969         } catch (IOException e) {
970             Slog.e(TAG, "Failed to write file " + TIMEOUT_HISTORY_FILE, e);
971         }
972     }
973 
readTimeoutHistory()974     private String[] readTimeoutHistory() {
975         final String[] emptyStringArray = {};
976 
977         try (BufferedReader reader = new BufferedReader(new FileReader(TIMEOUT_HISTORY_FILE))) {
978             String line = reader.readLine();
979             if (line == null) {
980                 return emptyStringArray;
981             }
982 
983             String[] data = line.trim().split(":");
984             String boottime = data.length >= 1 ? data[0] : "";
985             String history = data.length >= 2 ? data[1] : "";
986             if (SystemProperties.get("ro.boottime.zygote").equals(boottime) && !history.isEmpty()) {
987                 return history.split(",");
988             } else {
989                 return emptyStringArray;
990             }
991         } catch (FileNotFoundException e) {
992             return emptyStringArray;
993         } catch (IOException e) {
994             Slog.e(TAG, "Failed to read file " + TIMEOUT_HISTORY_FILE, e);
995             return emptyStringArray;
996         }
997     }
998 
hasActiveUsbConnection()999     private boolean hasActiveUsbConnection() {
1000         try {
1001             final String state = FileUtils.readTextFile(
1002                     new File("/sys/class/android_usb/android0/state"),
1003                     128 /*max*/, null /*ellipsis*/).trim();
1004             if ("CONFIGURED".equals(state)) {
1005                 return true;
1006             }
1007         } catch (IOException e) {
1008             Slog.w(TAG, "Failed to determine if device was on USB", e);
1009         }
1010         return false;
1011     }
1012 
isCrashLoopFound()1013     private boolean isCrashLoopFound() {
1014         int fatalCount = WatchdogProperties.fatal_count().orElse(0);
1015         long fatalWindowMs = TimeUnit.SECONDS.toMillis(
1016                 WatchdogProperties.fatal_window_seconds().orElse(0));
1017         if (fatalCount == 0 || fatalWindowMs == 0) {
1018             if (fatalCount != fatalWindowMs) {
1019                 Slog.w(TAG, String.format("sysprops '%s' and '%s' should be set or unset together",
1020                             PROP_FATAL_LOOP_COUNT, PROP_FATAL_LOOP_WINDOWS_SECS));
1021             }
1022             return false;
1023         }
1024 
1025         // new-history = [last (fatalCount - 1) items in old-history] + [nowMs].
1026         long nowMs = SystemClock.elapsedRealtime(); // Time since boot including deep sleep.
1027         String[] rawCrashHistory = readTimeoutHistory();
1028         ArrayList<String> crashHistory = new ArrayList<String>(Arrays.asList(Arrays.copyOfRange(
1029                         rawCrashHistory,
1030                         Math.max(0, rawCrashHistory.length - fatalCount - 1),
1031                         rawCrashHistory.length)));
1032         // Something wrong here.
1033         crashHistory.add(String.valueOf(nowMs));
1034         writeTimeoutHistory(crashHistory);
1035 
1036         // Returns false if the device has an active USB connection.
1037         if (hasActiveUsbConnection()) {
1038             return false;
1039         }
1040 
1041         long firstCrashMs;
1042         try {
1043             firstCrashMs = Long.parseLong(crashHistory.get(0));
1044         } catch (NumberFormatException t) {
1045             Slog.w(TAG, "Failed to parseLong " + crashHistory.get(0), t);
1046             resetTimeoutHistory();
1047             return false;
1048         }
1049         return crashHistory.size() >= fatalCount && nowMs - firstCrashMs < fatalWindowMs;
1050     }
1051 
breakCrashLoop()1052     private void breakCrashLoop() {
1053         try (FileWriter kmsg = new FileWriter("/dev/kmsg_debug", /* append= */ true)) {
1054             kmsg.append("Fatal reset to escape the system_server crashing loop\n");
1055         } catch (IOException e) {
1056             Slog.w(TAG, "Failed to append to kmsg", e);
1057         }
1058         doSysRq('c');
1059     }
1060 
1061     @Override
dump(@onNull PrintWriter pw, @Nullable String[] args)1062     public void dump(@NonNull PrintWriter pw, @Nullable String[] args) {
1063         pw.print("WatchdogTimeoutMillis=");
1064         pw.println(mWatchdogTimeoutMillis);
1065     }
1066 }
1067