The blog post at http://t.csdnimg.cn/yMbwW gave a basic analysis of RCU, but it did not explain how RCU actually invokes the callback functions registered with it. This article fills that gap.
The analysis below is based on Linux kernel 4.19.195.
First, let's look at the data structures RCU uses to manage callbacks.
struct callback_head {
struct callback_head *next; //the ->next field links the rcu_head structures queued on an rcu_data structure into a list
void (*func)(struct callback_head *head); //the actual callback function
} __attribute__((aligned(sizeof(void *))));
#define rcu_head callback_head
struct rcu_segcblist {
struct rcu_head *head; //points to the first rcu_head added to the list
struct rcu_head **tails[RCU_CBLIST_NSEGS]; //see rcu_segcblist_enqueue()
unsigned long gp_seq[RCU_CBLIST_NSEGS]; //grace-period numbers corresponding to the list segments. This is what allows different CPUs to have different ideas of which grace period is current while still avoiding invoking their callbacks prematurely. In particular, it lets CPUs that have been idle for a long time determine, once they wake up, which of their callbacks are ready to be invoked.
long len; //number of callbacks in ->head
long len_lazy;
};
The rcu_segcblist is initialized by the following code:
void rcu_segcblist_init(struct rcu_segcblist *rsclp)
{
int i;
BUILD_BUG_ON(RCU_NEXT_TAIL + 1 != ARRAY_SIZE(rsclp->gp_seq));
BUILD_BUG_ON(ARRAY_SIZE(rsclp->tails) != ARRAY_SIZE(rsclp->gp_seq));
rsclp->head = NULL;
for (i = 0; i < RCU_CBLIST_NSEGS; i++)
rsclp->tails[i] = &rsclp->head; //initially every tails entry points to &rsclp->head
rsclp->len = 0;
rsclp->len_lazy = 0;
}
The Linux kernel describes a callback object with struct callback_head: each callback_head is a node in a singly linked list, and its next field points to the next node. The rcu_segcblist structure manages all of a CPU's callback objects. Note that at initialization time the gp_seq values inside rcu_segcblist are set to 0, while rsp->gp_seq is initialized to a very large value just below the unsigned long wrap-around point.
struct rcu_segcblist is the management structure for this singly linked list. head points to the first node, while each tails entry points to "the address of some node's next field" (or to head itself). This sounds convoluted, but the design is clever and makes tail insertion into a singly linked list very simple, as the sketch below shows.
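To see why holding a pointer to a next field is so convenient, here is a minimal user-space sketch (not kernel code; the names are made up for illustration) of tail insertion into a singly linked list through such a pointer-to-pointer:

#include <stddef.h>

struct node {
	struct node *next;
};

struct list {
	struct node *head;
	struct node **tail;	/* points at head, or at the last node's ->next field */
};

static void list_init(struct list *l)
{
	l->head = NULL;
	l->tail = &l->head;	/* empty list: tail points at head itself */
}

static void list_enqueue(struct list *l, struct node *n)
{
	n->next = NULL;
	*l->tail = n;		/* link n after the current last node (or make it the head) */
	l->tail = &n->next;	/* tail now points at n's ->next field */
}

This is exactly the shape of rcu_segcblist_enqueue() shown later: appending is O(1) and the empty list needs no special case.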
Why does tails need four entries? The kernel comment explains it..... In short, the four segments distinguish how close each callback is to being invocable, and they make it easy to splice and merge parts of the list at different positions (the segment index macros are listed right below). The comment in the kernel header describing this structure is very clear and well worth reading.
Likewise, the gp_seq array has four entries, recording for each segment the grace period after which its callbacks become ready.
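For reference, the four segment indices come from include/linux/rcu_segcblist.h; in 4.19 they look roughly like this:

#define RCU_DONE_TAIL        0	/* Also RCU_WAIT head. */
#define RCU_WAIT_TAIL        1	/* Also RCU_NEXT_READY head. */
#define RCU_NEXT_READY_TAIL  2	/* Also RCU_NEXT head. */
#define RCU_NEXT_TAIL        3
#define RCU_CBLIST_NSEGS     4

Callbacks before tails[RCU_DONE_TAIL] have already had their grace period elapse and may be invoked; the RCU_WAIT_TAIL segment is waiting for the current grace period; the RCU_NEXT_READY_TAIL segment will wait for the next one; and the RCU_NEXT_TAIL segment holds callbacks not yet associated with any grace period.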
Now let's look at the call_rcu() function.
Simply put, it just hangs the callback onto this singly linked list.
/**
* call_rcu() - Queue an RCU callback for invocation after a grace period.
* @head: structure to be used for queueing the RCU updates.
* @func: actual callback function to be invoked after the grace period
*
* The callback function will be invoked some time after a full grace
* period elapses, in other words after all pre-existing RCU read-side
* critical sections have completed. However, the callback function
* might well execute concurrently with RCU read-side critical sections
* that started after call_rcu() was invoked. RCU read-side critical
* sections are delimited by rcu_read_lock() and rcu_read_unlock(),
* and may be nested.
*
* Note that all CPUs must agree that the grace period extended beyond
* all pre-existing RCU read-side critical section. On systems with more
* than one CPU, this means that when "func()" is invoked, each CPU is
* guaranteed to have executed a full memory barrier since the end of its
* last RCU read-side critical section whose beginning preceded the call
* to call_rcu(). It also means that each CPU executing an RCU read-side
* critical section that continues beyond the start of "func()" must have
* executed a memory barrier after the call_rcu() but before the beginning
* of that RCU read-side critical section. Note that these guarantees
* include CPUs that are offline, idle, or executing in user mode, as
* well as CPUs that are executing in the kernel.
*
* Furthermore, if CPU A invoked call_rcu() and CPU B invoked the
* resulting RCU callback function "func()", then both CPU A and CPU B are
* guaranteed to execute a full memory barrier during the time interval
* between the call to call_rcu() and the invocation of "func()" -- even
* if CPU A and CPU B are the same CPU (but again only if the system has
* more than one CPU).
*/
void call_rcu(struct rcu_head *head, rcu_callback_t func)
{
__call_rcu(head, func, rcu_state_p, -1, 0);
}
EXPORT_SYMBOL_GPL(call_rcu);
/*
* Helper function for call_rcu() and friends. The cpu argument will
* normally be -1, indicating "currently running CPU". It may specify
* a CPU only if that CPU is a no-CBs CPU. Currently, only _rcu_barrier()
* is expected to specify a CPU.
*/
static void
__call_rcu(struct rcu_head *head, rcu_callback_t func,
struct rcu_state *rsp, int cpu, bool lazy)
{
unsigned long flags;
struct rcu_data *rdp;
/* Misaligned rcu_head! */
WARN_ON_ONCE((unsigned long)head & (sizeof(void *) - 1));
if (debug_rcu_head_queue(head)) {
/*
* Probable double call_rcu(), so leak the callback.
* Use rcu:rcu_callback trace event to find the previous
* time callback was passed to __call_rcu().
*/
WARN_ONCE(1, "__call_rcu(): Double-freed CB %p->%pF()!!!\n",
head, head->func);
WRITE_ONCE(head->func, rcu_leak_callback);
return;
}
head->func = func;
head->next = NULL;
local_irq_save(flags);
rdp = this_cpu_ptr(rsp->rda);
/* Add the callback to our list. */
if (unlikely(!rcu_segcblist_is_enabled(&rdp->cblist)) || cpu != -1) {
int offline;
if (cpu != -1)
rdp = per_cpu_ptr(rsp->rda, cpu);
if (likely(rdp->mynode)) {
/* Post-boot, so this should be for a no-CBs CPU. */
offline = !__call_rcu_nocb(rdp, head, lazy, flags);
WARN_ON_ONCE(offline);
/* Offline CPU, _call_rcu() illegal, leak callback. */
local_irq_restore(flags);
return;
}
/*
* Very early boot, before rcu_init(). Initialize if needed
* and then drop through to queue the callback.
*/
BUG_ON(cpu != -1);
WARN_ON_ONCE(!rcu_is_watching());
if (rcu_segcblist_empty(&rdp->cblist))
rcu_segcblist_init(&rdp->cblist);
}
rcu_segcblist_enqueue(&rdp->cblist, head, lazy);
if (!lazy)
rcu_idle_count_callbacks_posted();
if (__is_kfree_rcu_offset((unsigned long)func))
trace_rcu_kfree_callback(rsp->name, head, (unsigned long)func,
rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist));
else
trace_rcu_callback(rsp->name, head,
rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist));
/* Go handle any RCU core processing required. */
__call_rcu_core(rsp, rdp, head, flags);
local_irq_restore(flags);
}
void rcu_segcblist_enqueue(struct rcu_segcblist *rsclp,
struct rcu_head *rhp, bool lazy)
{
WRITE_ONCE(rsclp->len, rsclp->len + 1); /* ->len sampled locklessly. */
if (lazy)
rsclp->len_lazy++;
smp_mb(); /* Ensure counts are updated before callback is enqueued. */
rhp->next = NULL;
*rsclp->tails[RCU_NEXT_TAIL] = rhp;//classic tail insertion into the singly linked list
rsclp->tails[RCU_NEXT_TAIL] = &rhp->next;
}
/*
* Handle any core-RCU processing required by a call_rcu() invocation.
*/
static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
struct rcu_head *head, unsigned long flags)
{
/*
* If called from an extended quiescent state, invoke the RCU
* core in order to force a re-evaluation of RCU's idleness.
*/
if (!rcu_is_watching())
invoke_rcu_core();
/* If interrupts were disabled or CPU offline, don't invoke RCU core. */
if (irqs_disabled_flags(flags) || cpu_is_offline(smp_processor_id()))
return;
/*
* Force the grace period if too many callbacks or too long waiting.
* Enforce hysteresis, and don't invoke force_quiescent_state()
* if some other CPU has recently done so. Also, don't bother
* invoking force_quiescent_state() if the newly enqueued callback
* is the only one waiting for a grace period to complete.
*/
if (unlikely(rcu_segcblist_n_cbs(&rdp->cblist) >
rdp->qlen_last_fqs_check + qhimark)) {
/* Are we ignoring a completed grace period? */
note_gp_changes(rsp, rdp);
/* Start a new grace period if one not already started. */
if (!rcu_gp_in_progress(rsp)) {
rcu_accelerate_cbs_unlocked(rsp, rdp->mynode, rdp);
} else {
/* Give the grace period a kick. */
rdp->blimit = LONG_MAX;
if (rsp->n_force_qs == rdp->n_force_qs_snap &&
rcu_segcblist_first_pend_cb(&rdp->cblist) != head)
force_quiescent_state(rsp);
rdp->n_force_qs_snap = rsp->n_force_qs;
rdp->qlen_last_fqs_check = rcu_segcblist_n_cbs(&rdp->cblist);
}
}
}
Leaving aside the no-CBs (nocb) code paths, call_rcu() really only does two things:
- it calls rcu_segcblist_enqueue() to hang the callback_head onto the singly linked list;
- it calls __call_rcu_core() to perform some grace-period checks.
rcu_segcblist_enqueue() is just the classic tail-pointer insertion shown earlier; any moderately experienced developer will follow it easily. The key point of __call_rcu_core() is to check whether too many callbacks have accumulated on this CPU's rcu_segcblist; if so, it takes further action (start a new grace period if none is in progress, or try to force a quiescent state if one is).
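As a reminder of how this API looks from the caller's side, here is a minimal, hypothetical sketch (the foo names are made up for illustration): a writer embeds an rcu_head in its object and hands a reclaim function to call_rcu(); that rcu_head is what ends up on the segmented list described above.

#include <linux/slab.h>
#include <linux/rcupdate.h>

struct foo {
	int data;
	struct rcu_head rcu;	/* embedded callback node */
};

static void foo_reclaim(struct rcu_head *head)
{
	/* runs some time after a grace period, typically from the RCU softirq */
	struct foo *fp = container_of(head, struct foo, rcu);

	kfree(fp);
}

static void foo_remove(struct foo *fp)
{
	/* after unpublishing fp from the RCU-protected structure it lives in: */
	call_rcu(&fp->rcu, foo_reclaim);
}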
Next, let's look at the tick code to see how the tick triggers RCU callback processing.
The tick calls rcu_check_callbacks() to do the RCU-related work. rcu_check_callbacks() in turn calls rcu_pending() to check whether this CPU has any RCU work that needs handling; if it does, invoke_rcu_core() is called to raise the RCU softirq.
void update_process_times(int user_tick)
{
*****
rcu_check_callbacks(user_tick); //RCU-related processing
********
}
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, returning 1 if so. This function is part of the
* RCU implementation; it is -not- an exported member of the RCU API.
*/
static int rcu_pending(void)
{
struct rcu_state *rsp;
for_each_rcu_flavor(rsp)
if (__rcu_pending(rsp, this_cpu_ptr(rsp->rda)))
return 1;
return 0;
}
/*
* Check to see if there is any immediate RCU-related work to be done
* by the current CPU, for the specified type of RCU, returning 1 if so.
* The checks are in order of increasing expense: checks that can be
* carried out against CPU-local state are performed first. However,
* we must check for CPU stalls first, else we might not get a chance.
*/
static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
{
***
/* Does this CPU have callbacks ready to invoke? */
if (rcu_segcblist_ready_cbs(&rdp->cblist))
return 1;
/* Has RCU gone idle with this CPU needing another grace period? */
//a new grace period needs to be started
if (!rcu_gp_in_progress(rsp) &&
rcu_segcblist_is_enabled(&rdp->cblist) &&
!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
return 1;
***
}
/*
* Does the specified rcu_segcblist structure contain callbacks that
* are ready to be invoked?
*/
//for a no-CBs CPU this always returns false (its cblist is disabled)
bool rcu_segcblist_ready_cbs(struct rcu_segcblist *rsclp)
{
return rcu_segcblist_is_enabled(rsclp) &&
&rsclp->head != rsclp->tails[RCU_DONE_TAIL];
}
/*
* Are all segments following the specified segment of the specified
* rcu_segcblist structure empty of callbacks? (The specified
* segment might well contain callbacks.)
*/
static inline bool rcu_segcblist_restempty(struct rcu_segcblist *rsclp, int seg)
{
return !*rsclp->tails[seg];
}
We only look at the parts related to invoking callbacks. If the callback list already holds callbacks that are ready to be invoked, rcu_pending() returns 1, meaning the RCU softirq should run.
Consider the very first call_rcu() after the system boots: there are obviously no callbacks ready to invoke yet, so execution falls through to the later checks. rcu_gp_in_progress() tells us whether a grace period is already in progress, and rcu_segcblist_restempty() checks whether any callbacks are queued after the RCU_NEXT_READY_TAIL segment waiting for a grace period. Because of the insertion that call_rcu() just performed, rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL) returns false, so rcu_pending() returns true and the RCU softirq is raised.
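A tiny user-space model (purely illustrative, not kernel code) shows why that check behaves this way on a freshly initialized list:

#include <assert.h>
#include <stddef.h>

#define RCU_DONE_TAIL        0
#define RCU_WAIT_TAIL        1
#define RCU_NEXT_READY_TAIL  2
#define RCU_NEXT_TAIL        3
#define RCU_CBLIST_NSEGS     4

struct toy_head { struct toy_head *next; };

struct toy_segcblist {
	struct toy_head *head;
	struct toy_head **tails[RCU_CBLIST_NSEGS];
};

static int toy_restempty(struct toy_segcblist *l, int seg)
{
	return !*l->tails[seg];		/* nothing queued after segment seg? */
}

int main(void)
{
	struct toy_segcblist l = { .head = NULL };
	struct toy_head cb = { .next = NULL };
	int i;

	for (i = 0; i < RCU_CBLIST_NSEGS; i++)
		l.tails[i] = &l.head;			/* as in rcu_segcblist_init() */
	assert(toy_restempty(&l, RCU_NEXT_READY_TAIL));	/* empty list */

	/* enqueue one callback into RCU_NEXT_TAIL, as call_rcu() does */
	*l.tails[RCU_NEXT_TAIL] = &cb;
	l.tails[RCU_NEXT_TAIL] = &cb.next;

	/* tails[RCU_NEXT_READY_TAIL] still points at head, which is now non-NULL */
	assert(!toy_restempty(&l, RCU_NEXT_READY_TAIL));
	return 0;
}

The new callback sits in the RCU_NEXT_TAIL segment, so "everything after RCU_NEXT_READY_TAIL" is no longer empty, which is exactly the condition __rcu_pending() uses to decide that a new grace period should be requested.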
So how does the RCU softirq actually invoke the callback functions?
/*
* Do RCU core processing for the current CPU.
*/
static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) //the RCU softirq handler
{
struct rcu_state *rsp;
if (cpu_is_offline(smp_processor_id()))//nothing to do on an offline CPU
return;
trace_rcu_utilization(TPS("Start RCU core"));
for_each_rcu_flavor(rsp)
__rcu_process_callbacks(rsp);
trace_rcu_utilization(TPS("End RCU core"));
}
/*
* This does the RCU core processing work for the specified rcu_state
* and rcu_data structures. This may be called only from the CPU to
* whom the rdp belongs.
 */
static void
__rcu_process_callbacks(struct rcu_state *rsp)
{
***
/* No grace period and unregistered callbacks? */
if (!rcu_gp_in_progress(rsp) &&
rcu_segcblist_is_enabled(&rdp->cblist)) {
local_irq_save(flags);
if (!rcu_segcblist_restempty(&rdp->cblist, RCU_NEXT_READY_TAIL))
rcu_accelerate_cbs_unlocked(rsp, rnp, rdp); //this checks whether a new grace period needs to be started
local_irq_restore(flags);
}
rcu_check_gp_start_stall(rsp, rnp, rdp);
/* If there are callbacks ready, invoke them. */
if (rcu_segcblist_ready_cbs(&rdp->cblist)) //are there callbacks ready to be invoked?
invoke_rcu_callbacks(rsp, rdp); //if so, invoke them
*****
}
/*
* Similar to rcu_accelerate_cbs(), but does not require that the leaf
* rcu_node structure's ->lock be held. It consults the cached value
* of ->gp_seq_needed in the rcu_data structure, and if that indicates
* that a new grace-period request be made, invokes rcu_accelerate_cbs()
* while holding the leaf rcu_node structure's ->lock.
*/
static void rcu_accelerate_cbs_unlocked(struct rcu_state *rsp,
struct rcu_node *rnp,
struct rcu_data *rdp)
{
unsigned long c;
bool needwake;
lockdep_assert_irqs_disabled();
c = rcu_seq_snap(&rsp->gp_seq);
if (!rdp->gpwrap && ULONG_CMP_GE(rdp->gp_seq_needed, c)) {
//an already-requested grace period will cover these callbacks, so no new request is needed
/* Old request still live, so mark recent callbacks. */
(void)rcu_segcblist_accelerate(&rdp->cblist, c);
return;
}
//otherwise we may need to request a new grace period
raw_spin_lock_rcu_node(rnp); /* irqs already disabled. */
needwake = rcu_accelerate_cbs(rsp, rnp, rdp);//accelerate the callbacks (may also request a new grace period)
raw_spin_unlock_rcu_node(rnp); /* irqs remain disabled. */
if (needwake)
rcu_gp_kthread_wake(rsp); //wake up the grace-period kthread
}
/*
* "Accelerate" callbacks based on more-accurate grace-period information.
* The reason for this is that RCU does not synchronize the beginnings and
* ends of grace periods, and that callbacks are posted locally. This in
* turn means that the callbacks must be labelled conservatively early
* on, as getting exact information would degrade both performance and
* scalability. When more accurate grace-period information becomes
* available, previously posted callbacks can be "accelerated", marking
* them to complete at the end of the earlier grace period.
*
* This function operates on an rcu_segcblist structure, and also the
* grace-period sequence number seq at which new callbacks would become
* ready to invoke. Returns true if there are callbacks that won't be
* ready to invoke until seq, false otherwise.
*/
//If the RCU_NEXT_READY_TAIL segment is empty, or its grace-period number is >= seq, the RCU_NEXT_TAIL segment is merged into RCU_NEXT_READY_TAIL and that segment's grace-period number is set to seq.
//If the RCU_WAIT_TAIL segment is also empty, or its grace-period number is >= seq, the two later segments are merged into RCU_WAIT_TAIL and their grace-period numbers are set to seq.
bool rcu_segcblist_accelerate(struct rcu_segcblist *rsclp, unsigned long seq)
{
int i;
WARN_ON_ONCE(!rcu_segcblist_is_enabled(rsclp));
if (rcu_segcblist_restempty(rsclp, RCU_DONE_TAIL))
return false;
/*
* Find the segment preceding the oldest segment of callbacks
* whose ->gp_seq[] completion is at or after that passed in via
* "seq", skipping any empty segments. This oldest segment, along
* with any later segments, can be merged in with any newly arrived
* callbacks in the RCU_NEXT_TAIL segment, and assigned "seq"
* as their ->gp_seq[] grace-period completion sequence number.
*/
//find the youngest non-empty segment whose ->gp_seq[] is earlier than seq
for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
if (rsclp->tails[i] != rsclp->tails[i - 1] &&
ULONG_CMP_LT(rsclp->gp_seq[i], seq)) //rsclp->gp_seq[i] < seq
break;
/*
* If all the segments contain callbacks that correspond to
* earlier grace-period sequence numbers than "seq", leave.
* Assuming that the rcu_segcblist structure has enough
* segments in its arrays, this can only happen if some of
* the non-done segments contain callbacks that really are
* ready to invoke. This situation will get straightened
* out by the next call to rcu_segcblist_advance().
*
* Also advance to the oldest segment of callbacks whose
* ->gp_seq[] completion is at or after that passed in via "seq",
* skipping any empty segments.
*/
//note: i is pre-incremented here
if (++i >= RCU_NEXT_TAIL) //after the loop i is at most RCU_NEXT_READY_TAIL, so ++i can equal RCU_NEXT_TAIL but never exceed it
return false;
/*
* Merge all later callbacks, including newly arrived callbacks,
* into the segment located by the for-loop above. Assign "seq"
* as the ->gp_seq[] value in order to correctly handle the case
* where there were no pending callbacks in the rcu_segcblist
* structure other than in the RCU_NEXT_TAIL segment.
*/
//from here on, the segments from i up to (but excluding) RCU_NEXT_TAIL are either empty or already have gp_seq >= seq
for (; i < RCU_NEXT_TAIL; i++) {
rsclp->tails[i] = rsclp->tails[RCU_NEXT_TAIL]; //point this tail at the end of the RCU_NEXT_TAIL segment, effectively merging the RCU_NEXT_TAIL callbacks into the earlier segments
rsclp->gp_seq[i] = seq; //record the grace period after which these callbacks will be ready
}
return true;
}
/*
* Schedule RCU callback invocation. If the specified type of RCU
* does not support RCU priority boosting, just do a direct call,
* otherwise wake up the per-CPU kernel kthread. Note that because we
* are running on the current CPU with softirqs disabled, the
* rcu_cpu_kthread_task cannot disappear out from under us.
*/
static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
{
if (unlikely(!READ_ONCE(rcu_scheduler_fully_active)))
return;
if (likely(!rsp->boost)) {
rcu_do_batch(rsp, rdp);
return;
}
invoke_rcu_callbacks_kthread();
}
/*
* Invoke any RCU callbacks that have made it to the end of their grace
 * period. Throttle as specified by rdp->blimit.
*/
static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
{
unsigned long flags;
struct rcu_head *rhp;
struct rcu_cblist rcl = RCU_CBLIST_INITIALIZER(rcl);
long bl, count;
/* If no callbacks are ready, just return. */
if (!rcu_segcblist_ready_cbs(&rdp->cblist)) {
trace_rcu_batch_start(rsp->name,
rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist), 0);
trace_rcu_batch_end(rsp->name, 0,
!rcu_segcblist_empty(&rdp->cblist),
need_resched(), is_idle_task(current),
rcu_is_callbacks_kthread());
return;
}
/*
* Extract the list of ready callbacks, disabling to prevent
* races with call_rcu() from interrupt handlers. Leave the
* callback counts, as rcu_barrier() needs to be conservative.
*/
local_irq_save(flags);
WARN_ON_ONCE(cpu_is_offline(smp_processor_id()));
bl = rdp->blimit;
trace_rcu_batch_start(rsp->name, rcu_segcblist_n_lazy_cbs(&rdp->cblist),
rcu_segcblist_n_cbs(&rdp->cblist), bl);
rcu_segcblist_extract_done_cbs(&rdp->cblist, &rcl);
local_irq_restore(flags);
/* Invoke callbacks. */
rhp = rcu_cblist_dequeue(&rcl);
for (; rhp; rhp = rcu_cblist_dequeue(&rcl)) {
debug_rcu_head_unqueue(rhp);
if (__rcu_reclaim(rsp->name, rhp))
rcu_cblist_dequeued_lazy(&rcl);
/*
* Stop only if limit reached and CPU has something to do.
* Note: The rcl structure counts down from zero.
*/
if (-rcl.len >= bl &&
(need_resched() ||
(!is_idle_task(current) && !rcu_is_callbacks_kthread())))
break;
}
local_irq_save(flags);
count = -rcl.len;
trace_rcu_batch_end(rsp->name, count, !!rcl.head, need_resched(),
is_idle_task(current), rcu_is_callbacks_kthread());
/* Update counts and requeue any remaining callbacks. */
rcu_segcblist_insert_done_cbs(&rdp->cblist, &rcl);
smp_mb(); /* List handling before counting for rcu_barrier(). */
rcu_segcblist_insert_count(&rdp->cblist, &rcl);
/* Reinstate batch limit if we have worked down the excess. */
count = rcu_segcblist_n_cbs(&rdp->cblist);
if (rdp->blimit == LONG_MAX && count <= qlowmark)
rdp->blimit = blimit;
/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
if (count == 0 && rdp->qlen_last_fqs_check != 0) {
rdp->qlen_last_fqs_check = 0;
rdp->n_force_qs_snap = rsp->n_force_qs;
} else if (count < rdp->qlen_last_fqs_check - qhimark)
rdp->qlen_last_fqs_check = count;
/*
* The following usually indicates a double call_rcu(). To track
* this down, try building with CONFIG_DEBUG_OBJECTS_RCU_HEAD=y.
*/
WARN_ON_ONCE(rcu_segcblist_empty(&rdp->cblist) != (count == 0));
local_irq_restore(flags);
/* Re-invoke RCU core processing if there are callbacks remaining. */
if (rcu_segcblist_ready_cbs(&rdp->cblist))
invoke_rcu_core();
}
The code is fairly involved. In short, __rcu_process_callbacks() -> rcu_accelerate_cbs_unlocked() -> rcu_segcblist_accelerate() adjusts where the tails entries point, so that as quiescent states are reported, callbacks migrate towards the head (done end) of the list; and once callbacks become invocable, __rcu_process_callbacks() -> invoke_rcu_callbacks() -> rcu_do_batch() -> __rcu_reclaim() walks them one by one and calls each callback function. Further details are left out here.
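To make the tails-shuffling in rcu_segcblist_accelerate() concrete, here is a small user-space model (illustrative only; it mirrors the kernel loops but drops the locking and the len/lazy accounting, and uses a plain < instead of ULONG_CMP_LT(), so it ignores sequence-number wraparound). One callback is enqueued into RCU_NEXT_TAIL and then accelerated with seq = 8; it ends up in the RCU_WAIT_TAIL segment, waiting for grace period 8 to complete.

#include <assert.h>
#include <stdio.h>
#include <stddef.h>

#define RCU_DONE_TAIL        0
#define RCU_WAIT_TAIL        1
#define RCU_NEXT_READY_TAIL  2
#define RCU_NEXT_TAIL        3
#define RCU_CBLIST_NSEGS     4

struct toy_head { struct toy_head *next; };

struct toy_segcblist {
	struct toy_head *head;
	struct toy_head **tails[RCU_CBLIST_NSEGS];
	unsigned long gp_seq[RCU_CBLIST_NSEGS];
};

/* mirrors rcu_segcblist_accelerate(), minus locking and length accounting */
static int toy_accelerate(struct toy_segcblist *l, unsigned long seq)
{
	int i;

	if (!*l->tails[RCU_DONE_TAIL])	/* nothing queued beyond the done segment */
		return 0;

	/* find the youngest non-empty segment whose gp_seq is before seq */
	for (i = RCU_NEXT_READY_TAIL; i > RCU_DONE_TAIL; i--)
		if (l->tails[i] != l->tails[i - 1] && l->gp_seq[i] < seq)
			break;
	if (++i >= RCU_NEXT_TAIL)
		return 0;

	/* merge everything newer, including RCU_NEXT_TAIL, into segment i onwards */
	for (; i < RCU_NEXT_TAIL; i++) {
		l->tails[i] = l->tails[RCU_NEXT_TAIL];
		l->gp_seq[i] = seq;
	}
	return 1;
}

int main(void)
{
	struct toy_segcblist l = { .head = NULL };
	struct toy_head cb = { .next = NULL };
	int i;

	for (i = 0; i < RCU_CBLIST_NSEGS; i++)
		l.tails[i] = &l.head;

	/* one callback enqueued into RCU_NEXT_TAIL, as call_rcu() would do */
	*l.tails[RCU_NEXT_TAIL] = &cb;
	l.tails[RCU_NEXT_TAIL] = &cb.next;

	assert(toy_accelerate(&l, 8));
	/* the callback is now in the RCU_WAIT_TAIL segment, waiting for gp 8 */
	assert(l.tails[RCU_WAIT_TAIL] == &cb.next);
	printf("callback waits for grace period %lu\n", l.gp_seq[RCU_WAIT_TAIL]);
	return 0;
}

Once that grace period completes, rcu_segcblist_advance() (not covered in this article) moves the segment boundaries the other way, the callback lands in the done segment, rcu_segcblist_ready_cbs() starts returning true, and rcu_do_batch() finally invokes it.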