Bootstrap

Linux电源管理——系统Suspend/Resume流程

本篇文章主要是自己的学习笔记,主要内容是分析linux系统中设备的Suspend和Resume流程,用到的内核版本为 linux-4.14。

目录

1、Linux 内核的Suspend方法

2、__device_suspend 函数

3、pm_op 函数

4、suspend_enter 函数

5、resume流程


1、Linux 内核的Suspend方法

在 Linux 内核中有三种 Suspend 的方法,分别是 Freeze、Standby、Suspend to RAM。在用户空间向 /sys/power/state 文件写入 “freeze”、“standby”、“mem” 就可以触发相应的 Suspend,如下所示。关于这几种 Suspend 的区别,如果大家感兴趣可以自行查阅资料,这里就不再展开了。

echo "freeze" > /sys/power/state
echo "standby" > /sys/power/state
echo "mem" > /sys/power/state

当执行上面命令会通过 sysfs 陷入到内核,并触发 Suspend ,相应的处理代码如下:

/*
 * state_store - store handler for a kobject attribute; per the surrounding
 * article it services writes to /sys/power/state.
 * @kobj: kobject the attribute belongs to (not used in this body).
 * @attr: attribute being written (not used in this body).
 * @buf: text written by the caller.
 * @n: length of @buf.
 *
 * Returns @n when the requested transition reported no error, otherwise the
 * non-zero error value.
 */
static ssize_t state_store(struct kobject *kobj, struct kobj_attribute *attr,
			   const char *buf, size_t n)
{
	suspend_state_t state;
	int error;

	/* Take the autosleep lock; give up immediately if that fails. */
	error = pm_autosleep_lock();
	if (error)
		return error;

	/* An autosleep state above PM_SUSPEND_ON blocks manual requests. */
	if (pm_autosleep_state() > PM_SUSPEND_ON) {
		error = -EBUSY;
		goto out;
	}

	/* Map the written string onto a suspend_state_t value. */
	state = decode_state(buf, n);
	if (state < PM_SUSPEND_MAX) {
		/*
		 * "mem" is redirected to whatever mem_sleep_current holds
		 * (defined outside this excerpt).
		 */
		if (state == PM_SUSPEND_MEM)
			state = mem_sleep_current;

		/* pm_suspend() itself rejects PM_SUSPEND_ON with -EINVAL. */
		error = pm_suspend(state);
	} else if (state == PM_SUSPEND_MAX) {
		/* PM_SUSPEND_MAX is treated as the request to hibernate. */
		error = hibernate();
	} else {
		error = -EINVAL;
	}

 out:
	pm_autosleep_unlock();
	/* On success report the whole input as consumed. */
	return error ? error : n;
}

decode_state 函数主要功能是 根据输入的字符串进行相应的匹配,返回值 state  的定义如下:

/* Integer type used for the system sleep state identifiers below. */
typedef int __bitwise suspend_state_t;

/* Sleep state identifiers, numbered 0..4 in ascending order. */
#define PM_SUSPEND_ON		((__force suspend_state_t) 0)
#define PM_SUSPEND_TO_IDLE	((__force suspend_state_t) 1)
#define PM_SUSPEND_STANDBY	((__force suspend_state_t) 2)
#define PM_SUSPEND_MEM		((__force suspend_state_t) 3)
/* Alias for the smallest state above PM_SUSPEND_ON. */
#define PM_SUSPEND_MIN		PM_SUSPEND_TO_IDLE
/* Upper bound: pm_suspend() rejects any state >= this value. */
#define PM_SUSPEND_MAX		((__force suspend_state_t) 4)

如果 state 满足相关条件就会进入 pm_suspend 函数,该函数定义如下:

/*
 * pm_suspend - validate @state, run the suspend sequence through
 * enter_state() and account the outcome in suspend_stats.
 * @state: target sleep state; must lie strictly between PM_SUSPEND_ON and
 *         PM_SUSPEND_MAX.
 *
 * Returns -EINVAL for an out-of-range @state, otherwise the value returned by
 * enter_state().
 */
int pm_suspend(suspend_state_t state)
{
	int error;

	/* Only states strictly between ON and MAX are accepted. */
	if (state <= PM_SUSPEND_ON || state >= PM_SUSPEND_MAX)
		return -EINVAL;

	/* mem_sleep_labels[] is indexed by the state value. */
	pr_info("suspend entry (%s)\n", mem_sleep_labels[state]);
	error = enter_state(state);
	/* Bump the matching counter; a failure also records its errno. */
	if (error) {
		suspend_stats.fail++;
		dpm_save_failed_errno(error);
	} else {
		suspend_stats.success++;
	}
	pr_info("suspend exit\n");
	return error;
}

然后再进入 enter_state 函数,定义如下:

/*
 * enter_state - carry out one complete suspend attempt for @state.
 * @state: sleep state to enter.
 *
 * Validates the state, takes pm_mutex without blocking, syncs filesystems
 * (unless CONFIG_SUSPEND_SKIP_SYNC is set), runs suspend_prepare(), then
 * suspend_devices_and_enter(), and finally suspend_finish().
 *
 * Returns 0 when no step failed; -EAGAIN, -EINVAL or -EBUSY for the early
 * rejections visible below; otherwise the error from suspend_prepare() or
 * suspend_devices_and_enter().
 */
static int enter_state(suspend_state_t state)
{
	int error;

	trace_suspend_resume(TPS("suspend_enter"), state, true);
	if (state == PM_SUSPEND_TO_IDLE) {
		/*
		 * Suspend-to-idle is not passed through valid_state(); with
		 * PM debugging enabled only the test level is checked.
		 */
#ifdef CONFIG_PM_DEBUG
		if (pm_test_level != TEST_NONE && pm_test_level <= TEST_CPUS) {
			pr_warn("Unsupported test mode for suspend to idle, please choose none/freezer/devices/platform.\n");
			return -EAGAIN;
		}
#endif
	} else if (!valid_state(state)) {
		return -EINVAL;
	}
	/* Do not wait for the mutex: report -EBUSY if it is already held. */
	if (!mutex_trylock(&pm_mutex))
		return -EBUSY;

	if (state == PM_SUSPEND_TO_IDLE)
		s2idle_begin();

	/* Filesystem sync can be compiled out via CONFIG_SUSPEND_SKIP_SYNC. */
#ifndef CONFIG_SUSPEND_SKIP_SYNC
	trace_suspend_resume(TPS("sync_filesystems"), 0, true);
	pr_info("Syncing filesystems ... ");
	sys_sync();
	pr_cont("done.\n");
	trace_suspend_resume(TPS("sync_filesystems"), 0, false);
#endif

	pm_pr_dbg("Preparing system for sleep (%s)\n", mem_sleep_labels[state]);
	pm_suspend_clear_flags();
	/* A failed prepare has nothing to finish: just drop the mutex. */
	error = suspend_prepare(state);
	if (error)
		goto Unlock;

	/* When this test level triggers, skip the device phase and unwind. */
	if (suspend_test(TEST_FREEZER))
		goto Finish;

	trace_suspend_resume(TPS("suspend_enter"), state, false);
	pm_pr_dbg("Suspending system (%s)\n", mem_sleep_labels[state]);
	/* The GFP mask restriction brackets the device/sleep phase only. */
	pm_restrict_gfp_mask();
	error = suspend_devices_and_enter(state);
	pm_restore_gfp_mask();

 Finish:
	events_check_enabled = false;
	pm_pr_dbg("Finishing wakeup.\n");
	suspend_finish();
 Unlock:
	mutex_unlock(&pm_mutex);
	return error;
}

(1)valid_state 函数主要是用来检查平台是否支持该电源状态,该函数的定义如下:

/*
 * valid_state - ask the platform's suspend_ops whether @state is usable.
 * @state: sleep state to check.
 *
 * Returns true only when suspend_ops is set, provides a ->valid callback and
 * that callback returns non-zero for @state.
 */
static bool valid_state(suspend_state_t state)
{
	/*
	 * PM_SUSPEND_STANDBY and PM_SUSPEND_MEM states need low level
	 * support and need to be valid to the low level
	 * implementation, no valid callback implies that none are valid.
	 */
	return suspend_ops && suspend_ops->valid && suspend_ops->valid(state);
}

通过注释也能够大概知道该函数的作用,当 state 等于 standby 或者 mem时,则需要调用suspend_ops 中的 valid 回调,通过底层平台代码判断是否支持,关于 valid 回调的实现后续会介绍。

(2)suspend_prepare 函数主要进行 suspend 前的准备,比如 switch console 和 thread freezing,如果失败,则终止 suspend。函数定义如下:

/**
 * suspend_prepare - Prepare for entering system sleep state.
 * @state: System sleep state about to be entered.
 *
 * Common code run for every system sleep state that can be entered (except for
 * hibernation).  Run suspend notifiers, allocate the "suspend" console and
 * freeze processes.
 *
 * Return: 0 on success, -EPERM if @state is not supported, otherwise the
 * error from the notifier chain or from freezing processes.
 */
static int suspend_prepare(suspend_state_t state)
{
	int error, nr_calls = 0;

	if (!sleep_state_supported(state))
		return -EPERM;

	pm_prepare_console();

	error = __pm_notifier_call_chain(PM_SUSPEND_PREPARE, -1, &nr_calls);
	if (error) {
		/*
		 * One call fewer is passed to the PM_POST_SUSPEND chain below;
		 * presumably this excludes the notifier that failed - confirm
		 * against __pm_notifier_call_chain().
		 */
		nr_calls--;
		goto Finish;
	}

	trace_suspend_resume(TPS("freeze_processes"), 0, true);
	error = suspend_freeze_processes();
	trace_suspend_resume(TPS("freeze_processes"), 0, false);
	if (!error)
		return 0;

	/* Freezing failed: record it, then undo notifiers and console. */
	suspend_stats.failed_freeze++;
	dpm_save_failed_step(SUSPEND_FREEZE);
 Finish:
	__pm_notifier_call_chain(PM_POST_SUSPEND, nr_calls, NULL);
	pm_restore_console();
	return error;
}

 sleep_state_supported : 对于 freeze(PM_SUSPEND_TO_IDLE)状态直接返回 true;对于其它状态,则检查 suspend_ops 是否提供了 .enter 回调,该回调会在后面使用到。

/*
 * sleep_state_supported - tell whether @state can be entered at all.
 *
 * PM_SUSPEND_TO_IDLE is always reported as supported; every other state
 * requires suspend_ops to be set and to provide an ->enter callback.
 */
static bool sleep_state_supported(suspend_state_t state)
{
	return state == PM_SUSPEND_TO_IDLE || (suspend_ops && suspend_ops->enter);
}

pm_prepare_console :将当前console切换到一个虚拟console。

__pm_notifier_call_chain :发送开始 suspend 的消息。

suspend_freeze_processes :freeze用户空间进程和一些内核线程。

(3)suspend_devices_and_enter 函数的实现如下:

/**
 * suspend_devices_and_enter - Suspend devices and enter system sleep state.
 * @state: System sleep state to enter.
 *
 * Notifies the platform, suspends the console and the devices, enters the
 * sleep state through suspend_enter() (possibly several times), then resumes
 * devices and console and notifies the platform again.
 *
 * Return: 0 on success, -ENOSYS if @state is not supported, otherwise the
 * error of the step that failed.
 */
int suspend_devices_and_enter(suspend_state_t state)
{
	int error;
	bool wakeup = false;

	if (!sleep_state_supported(state))
		return -ENOSYS;

	pm_suspend_target_state = state;

	/* Nothing has been suspended yet, so a failure goes straight to Close. */
	error = platform_suspend_begin(state);
	if (error)
		goto Close;

	suspend_console();
	suspend_test_start();
	error = dpm_suspend_start(PMSG_SUSPEND);
	if (error) {
		pr_err("Some devices failed to suspend, or early wake event detected\n");
		goto Recover_platform;
	}
	suspend_test_finish("suspend devices");
	if (suspend_test(TEST_DEVICES))
		goto Recover_platform;

	/*
	 * Re-enter the sleep state for as long as suspend_enter() succeeded,
	 * no wakeup was flagged and platform_suspend_again() asks for it.
	 */
	do {
		error = suspend_enter(state, &wakeup);
	} while (!error && !wakeup && platform_suspend_again(state));

 Resume_devices:
	suspend_test_start();
	dpm_resume_end(PMSG_RESUME);
	suspend_test_finish("resume devices");
	trace_suspend_resume(TPS("resume_console"), state, true);
	resume_console();
	trace_suspend_resume(TPS("resume_console"), state, false);

 Close:
	platform_resume_end(state);
	pm_suspend_target_state = PM_SUSPEND_ON;
	return error;

	/* Error path: let the platform recover, then resume the devices. */
 Recover_platform:
	platform_recover(state);
	goto Resume_devices;
}

sleep_state_supported : 再次检查suspend_ops是否有提供.enter回调。

platform_suspend_begin :如果 suspend_ops 提供了 begin 回调则调用它,通知平台代码,让其作相应的处理。

suspend_console :挂起console。

suspend_test_start :记录系统挂起的开始时间点。

dpm_suspend_start:调用所有设备的->prepare和->suspend回调函数

函数的实现如下:

/**
 * dpm_suspend_start - Prepare devices for PM transition and suspend them.
 * @state: PM transition of the system being carried out.
 *
 * Prepare all non-sysdev devices for system PM transition and execute "suspend"
 * callbacks for them.
 *
 * Return: 0 on success, otherwise the error from dpm_prepare() or
 * dpm_suspend().
 */
int dpm_suspend_start(pm_message_t state)
{
	int error;

	error = dpm_prepare(state);
	if (error) {
		/* Prepare failed: record it and do not call dpm_suspend(). */
		suspend_stats.failed_prepare++;
		dpm_save_failed_step(SUSPEND_PREPARE);
	} else
		error = dpm_suspend(state);
	return error;
}

dpm_prepare :对非系统(non-sysdev)设备执行 prepare 回调,在设备的 prepare 回调返回之后,就不应该再为该设备注册新的子设备了。

dpm_suspend :调用所有非系统设备的 suspend 函数,设备将停止操作。

dpm_suspend 函数的调用流程如下:

int dpm_suspend(pm_message_t state)

==>    error = device_suspend(dev);
==>        return __device_suspend(dev, pm_transition, false);

2、__device_suspend 函数

dpm_suspend函数最后是调用到了__device_suspend 函数,该函数的实现如下:

/**
 * __device_suspend - Execute "suspend" callbacks for given device.
 * @dev: Device to handle.
 * @state: PM transition of the system being carried out.
 * @async: If true, the device is being suspended asynchronously.
 *
 * Looks up a callback via pm_op() from the device's power domain, type,
 * class or bus (first match wins) and falls back to the driver's callbacks
 * when none was found, then runs it via dpm_run_callback().
 *
 * Return: 0 on success or when the device is skipped, otherwise the error
 * returned by the callback.
 */
static int __device_suspend(struct device *dev, pm_message_t state, bool async)
{
	pm_callback_t callback = NULL;
	const char *info = NULL;
	int error = 0;
	DECLARE_DPM_WATCHDOG_ON_STACK(wd);

	TRACE_DEVICE(dev);
	TRACE_SUSPEND(0);

	/* NOTE(review): presumably waits for devices that must suspend first. */
	dpm_wait_for_subordinate(dev, async);

	/* An earlier failure is already recorded: skip this device. */
	if (async_error) {
		dev->power.direct_complete = false;
		goto Complete;
	}

	/*
	 * If a device configured to wake up the system from sleep states
	 * has been suspended at run time and there's a resume request pending
	 * for it, this is equivalent to the device signaling wakeup, so the
	 * system suspend operation should be aborted.
	 */
	if (pm_runtime_barrier(dev) && device_may_wakeup(dev))
		pm_wakeup_event(dev, 0);

	/*
	 * A pending wakeup aborts the transition via async_error; note that
	 * "error" stays 0 here, so this call itself still returns 0.
	 */
	if (pm_wakeup_pending()) {
		dev->power.direct_complete = false;
		async_error = -EBUSY;
		goto Complete;
	}

	/* Devices flagged as syscore get no callbacks from this function. */
	if (dev->power.syscore)
		goto Complete;

	/* Avoid direct_complete to let wakeup_path propagate. */
	if (device_may_wakeup(dev) || dev->power.wakeup_path)
		dev->power.direct_complete = false;

	/*
	 * direct_complete: if the device is still runtime-suspended once
	 * runtime PM has been disabled, leave it alone and skip the callbacks.
	 * Otherwise re-enable runtime PM and take the normal path.
	 */
	if (dev->power.direct_complete) {
		if (pm_runtime_status_suspended(dev)) {
			pm_runtime_disable(dev);
			if (pm_runtime_status_suspended(dev))
				goto Complete;

			pm_runtime_enable(dev);
		}
		dev->power.direct_complete = false;
	}

	dpm_watchdog_set(&wd, dev);
	device_lock(dev);

	/*
	 * Callback lookup order: power domain, device type, class, bus.
	 * The first three jump to Run as soon as they apply.
	 */
	if (dev->pm_domain) {
		info = "power domain ";
		callback = pm_op(&dev->pm_domain->ops, state);
		goto Run;
	}

	if (dev->type && dev->type->pm) {
		info = "type ";
		callback = pm_op(dev->type->pm, state);
		goto Run;
	}

	if (dev->class) {
		if (dev->class->pm) {
			info = "class ";
			callback = pm_op(dev->class->pm, state);
			goto Run;
		} else if (dev->class->suspend) {
			/* Legacy class hook: runs here and bypasses Run. */
			pm_dev_dbg(dev, state, "legacy class ");
			error = legacy_suspend(dev, state, dev->class->suspend,
						"legacy class ");
			goto End;
		}
	}

	if (dev->bus) {
		if (dev->bus->pm) {
			info = "bus ";
			callback = pm_op(dev->bus->pm, state);
		} else if (dev->bus->suspend) {
			/* Legacy bus hook: runs here and bypasses Run. */
			pm_dev_dbg(dev, state, "legacy bus ");
			error = legacy_suspend(dev, state, dev->bus->suspend,
						"legacy bus ");
			goto End;
		}
	}

 Run:
	/* Fall back to the driver's callbacks if no callback was found. */
	if (!callback && dev->driver && dev->driver->pm) {
		info = "driver ";
		callback = pm_op(dev->driver->pm, state);
	}

	/* Config-specific hook: passes the callback address to aee_rr_rec_*. */
#ifdef CONFIG_MTK_RAM_CONSOLE
	if (async)
		aee_rr_rec_last_async_func((unsigned long int)callback);
	else
		aee_rr_rec_last_sync_func((unsigned long int)callback);
#endif

	error = dpm_run_callback(callback, dev, state, info);

 End:
	if (!error) {
		struct device *parent = dev->parent;

		dev->power.is_suspended = true;
		if (parent) {
			/*
			 * Under the parent's power lock: the parent may no
			 * longer use direct_complete, and wakeup_path is
			 * propagated upwards unless the parent ignores its
			 * children.
			 */
			spin_lock_irq(&parent->power.lock);
			dev->parent->power.direct_complete = false;
			if (dev->power.wakeup_path
			    && !dev->parent->power.ignore_children)
				dev->parent->power.wakeup_path = true;

			spin_unlock_irq(&parent->power.lock);
		}
		dpm_clear_suppliers_direct_complete(dev);
	} else {
		/* On the legacy paths "callback" is still NULL at this point. */
		log_suspend_abort_reason("Callback failed on %s in %pS returned %d",
					 dev_name(dev), callback, error);
	}

	device_unlock(dev);
	dpm_watchdog_clear(&wd);

 Complete:
	/* Publish a failure so that later devices bail out early. */
	if (error)
		async_error = error;

	complete_all(&dev->power.completion);
	TRACE_SUSPEND(error);
	return error;
}

通过注释就可以看出这个函数是执行系统中给定设备的 suspend 回调函数。

在旧版本的 linux 内核中,这些 callbacks 是放在设备模型的已有结构体中,比如 struct bus_type、struct device/driver、struct class 等这些数据结构中都会有 suspend/resume 函数的身影,但这样做不具备良好的封装性和实用性。

后来就将这些Callbacks封装为一个统一的数据结构,也就是 struct dev_pm_ops ,上层的数据结构只需要包含这个结构即可。该结构体的定义如下:

struct dev_pm_ops {
	int (*prepare)(struct device *dev);
	void (*complete)(struct device *dev);
	int (*suspend)(struct device *dev);
	int (*resume)(struct device *dev);
	int (*freeze)(struct device *dev);
	int (*thaw)(struct device *dev);
	int (*poweroff)(struct device *dev);
	int (*restore)(struct device *dev);
	int (*suspend_late)(struct device *dev);
	int (*resume_early)(struct device *dev);
	int (*freeze_late)(struct device *dev);
	int (*thaw_early)(struct device *dev);
	int (*poweroff_late)(struct device *dev);
	int (*restore_early)(struct device *dev);
	int (*suspend_noirq)(struct device *dev);
	int (*resume_noirq)(struct device *dev);
	int (*freeze_noirq)(struct device *dev);
	int (*thaw_noirq)(struct device *dev);
	int (*poweroff_noirq)(struct device *dev);
	int (*restore_noirq)(struct device *dev);
	int (*runtime_suspend)(struct device *dev);
	int (*runtime_resume)(struct device *dev);
	int (*runtime_idle)(struct device *dev);
};

这里面的 callbacks 都是和具体设备挂钩的,比如 suspend / resume ,callbacks 的实现和具体的设备有很大关系,这就需要工程师在设计Driver的时候,知道这些 callbacks 的使用场景,根据具体的需求进行分析。

回到 __device_suspend 函数中,callback = pm_op() 就是用来获取设备相应的回调函数,并保存在 callback 变量中。查找时按如下优先级进行,命中后不再查看后面的层级;如果最终得到的 callback 为空,则会退回到使用 dev->driver->pm 中的回调:

-> struct dev_pm_domain *pm_domain

-> struct device_type *type;

-> struct class *class;

-> struct bus_type *bus;

3、pm_op 函数

通过上面的分析可以知道__device_suspend函数最后是调用了pm_op函数,函数实现如下:

/**
 * pm_op - Return the PM operation appropriate for given PM event.
 * @ops: PM operations to choose from.
 * @state: PM transition of the system being carried out.
 *
 * Return: the matching callback from @ops, or NULL when the event is not
 * handled (including when the relevant config option is disabled).
 */
static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state)
{
	switch (state.event) {
#ifdef CONFIG_SUSPEND
	case PM_EVENT_SUSPEND:
		return ops->suspend;
	case PM_EVENT_RESUME:
		return ops->resume;
#endif /* CONFIG_SUSPEND */
#ifdef CONFIG_HIBERNATE_CALLBACKS
	/* FREEZE and QUIESCE share ->freeze. */
	case PM_EVENT_FREEZE:
	case PM_EVENT_QUIESCE:
		return ops->freeze;
	case PM_EVENT_HIBERNATE:
		return ops->poweroff;
	/* THAW and RECOVER share ->thaw. */
	case PM_EVENT_THAW:
	case PM_EVENT_RECOVER:
		return ops->thaw;
		/* Unreachable: follows a return (kept as in the quoted source). */
		break;
	case PM_EVENT_RESTORE:
		return ops->restore;
#endif /* CONFIG_HIBERNATE_CALLBACKS */
	}
	/* No case matched: there is no callback for this event. */
	return NULL;
}

这样便得到设备的 suspend callback 函数,然后继续往下可以看到 dpm_run_callback(callback, dev, state, info) 函数,该函数就是运行前面获取的 callback,函数的定义如下:

/*
 * dpm_run_callback - invoke a PM callback for a device.
 * @cb: callback to run; may be NULL.
 * @dev: device passed to the callback.
 * @state: PM transition being carried out (not used in the visible lines).
 * @info: label describing where the callback came from (not used in the
 *        visible lines).
 *
 * Returns 0 when @cb is NULL, otherwise the callback's return value.
 *
 * NOTE: the "......" lines mark code the article elided, so this excerpt is
 * not the complete function.
 */
static int dpm_run_callback(pm_callback_t cb, struct device *dev,
			    pm_message_t state, char *info)
{
	/* Not used in the visible lines; presumably used by the elided code. */
	ktime_t calltime;
	int error;

	/* A missing callback counts as success. */
	if (!cb)
		return 0;

    ......
	error = cb(dev);
    ......

	return error;
}

这样就能够执行到 driver 中的 suspend 回调函数。

系统在 suspend/resume 的过程中,会依次调用 prepare —> suspend —> suspend_late —> suspend_noirq —> wakeup —> resume_noirq —> resume_early —> resume —> complete。目前只是调用到了 suspend 函数,其它的调用流程会在后面体现。

4、suspend_enter 函数

好了,接下来分析其它代码,现在回到 suspend_devices_and_enter 函数中,dpm_suspend_start 函数已经分析完了,现在分析 suspend_enter 函数,该函数的定义如下:

/**
 * suspend_enter - Make the system enter the given sleep state.
 * @state: System sleep state to enter.
 * @wakeup: Returns information that the sleep state should not be re-entered.
 *
 * This function should be called after devices have been suspended.
 *
 * The suspend steps are undone in reverse order through the chain of labels
 * at the bottom; each failure jumps to the label matching how far the
 * sequence had progressed.
 *
 * Return: 0 when no step failed, otherwise the error of the failing step
 * (-EBUSY when a wakeup was pending right before entering the state).
 */
static int suspend_enter(suspend_state_t state, bool *wakeup)
{
	int error, last_dev;

	error = platform_suspend_prepare(state);
	if (error)
		goto Platform_finish;

	error = dpm_suspend_late(PMSG_SUSPEND);
	if (error) {
		/*
		 * Index of the entry before last_failed_dev, wrapping at
		 * REC_FAILED_NUM; presumably the most recently recorded
		 * failed device - confirm against suspend_stats.
		 */
		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
		last_dev %= REC_FAILED_NUM;
		pr_err("late suspend of devices failed\n");
		log_suspend_abort_reason("late suspend of %s device failed",
					 suspend_stats.failed_devs[last_dev]);
		goto Platform_finish;
	}
	error = platform_suspend_prepare_late(state);
	if (error)
		goto Devices_early_resume;

	/*
	 * Suspend-to-idle (outside the TEST_PLATFORM test level) stays in
	 * s2idle_loop() and skips the noirq, CPU and syscore stages below.
	 */
	if (state == PM_SUSPEND_TO_IDLE && pm_test_level != TEST_PLATFORM) {
		s2idle_loop();
		goto Platform_early_resume;
	}

	error = dpm_suspend_noirq(PMSG_SUSPEND);
	if (error) {
		/* Same index computation as for the late-suspend failure. */
		last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
		last_dev %= REC_FAILED_NUM;
		pr_err("noirq suspend of devices failed\n");
		log_suspend_abort_reason("noirq suspend of %s device failed",
					 suspend_stats.failed_devs[last_dev]);
		goto Platform_early_resume;
	}
	error = platform_suspend_prepare_noirq(state);
	if (error)
		goto Platform_wake;

	if (suspend_test(TEST_PLATFORM))
		goto Platform_wake;

	error = disable_nonboot_cpus();
	/*
	 * Note: the abort reason is also logged when only the TEST_CPUS test
	 * level triggered this branch.
	 */
	if (error || suspend_test(TEST_CPUS)) {
		log_suspend_abort_reason("Disabling non-boot cpus failed");
		goto Enable_cpus;
	}

	arch_suspend_disable_irqs();
	/* Interrupts must really be off from here on. */
	BUG_ON(!irqs_disabled());

	error = syscore_suspend();
	if (!error) {
		/* Last check for wakeup events before entering the state. */
		*wakeup = pm_wakeup_pending();
		if (!(suspend_test(TEST_CORE) || *wakeup)) {
			trace_suspend_resume(TPS("machine_suspend"),
				state, true);
			/* Hand over to the platform's ->enter callback. */
			error = suspend_ops->enter(state);
			trace_suspend_resume(TPS("machine_suspend"),
				state, false);
		} else if (*wakeup) {
			error = -EBUSY;
		}
		syscore_resume();
	}

	arch_suspend_enable_irqs();
	BUG_ON(irqs_disabled());

	/* Unwind path: each label undoes one stage, in reverse order. */
 Enable_cpus:
	enable_nonboot_cpus();

 Platform_wake:
	platform_resume_noirq(state);
	dpm_resume_noirq(PMSG_RESUME);

 Platform_early_resume:
	platform_resume_early(state);

 Devices_early_resume:
	dpm_resume_early(PMSG_RESUME);

 Platform_finish:
	platform_resume_finish(state);
	return error;
}

platform_suspend_prepare : 如果平台提供了 suspend_ops->prepare() 回调,则调用该回调做平台相关的准备工作。

dpm_suspend_late:延迟挂起设备,在最后阶段挂起设备。

platform_suspend_prepare_late:准备进入睡眠状态的延迟阶段,执行平台相关的准备操作。

dpm_suspend_noirq:在设备中断处理函数不再被调用的情况下(noirq 阶段),执行所有设备的 suspend_noirq 回调。

platform_suspend_prepare_noirq:准备进入睡眠状态的无中断阶段,执行平台相关的准备操作。

disable_nonboot_cpus:关闭所有非 boot CPU 。

arch_suspend_disable_irqs:关闭全局中断

syscore_suspend:执行系统核心的挂起操作。

pm_wakeup_pending:检查在这段时间内是否有唤醒事件的发生,如果有就要终止suspend

如果前面的阶段都一切顺利,则调用 suspend_ops->enter(state) 回调进行 suspend,这时系统就已经睡过去了,至此完成系统的 suspend。

5、resume流程

系统的 resume 过程和 suspend 的流程恰好相反,这里就不展开分析了:

syscore_resume():恢复系统核心。

arch_suspend_enable_irqs():使能中断。

enable_nonboot_cpus():使能非启动的 CPU。

platform_resume_noirq(state):在没有中断的情况下恢复平台。

dpm_resume_noirq(PMSG_RESUME):在没有中断的情况下恢复设备。

platform_resume_early(state):早期恢复平台。

dpm_resume_early(PMSG_RESUME):早期恢复设备。

platform_resume_finish(state):完成平台的恢复操作。

整个系统的 suspend/resume 流程就分析到这里,如果大家想了解 suspend_ops->enter 函数到底做了什么,可以看我的另外一篇文章。

;