从一个CFS调度案例谈Linux系统卡顿的根源( 三 )

看来确实存在生产者和消费者配合在一起霸屏的现象 , 只是比较少而已 。
到底是什么原因让两个进程如此粘连在一起的呢?这个问题比较有意思 。
CFS调度器并没有兑现“消除trick , 保持简单”的承诺 , 越来越多的“启发式算法feature”被加入进去 , 重蹈了O(1)调度器的老路!!
我们从 /sys/kernel/debug/sched_features 中可以看到这些“trick式的feature” :
[root@localhost test]# cat /sys/kernel/debug/sched_features
GENTLE_FAIR_SLEEPERS START_DEBIT NO_NEXT_BUDDY LAST_BUDDY CACHE_HOT_BUDDY WAKEUP_PREEMPTION ARCH_POWER NO_HRTICK NO_DOUBLE_TICK LB_BIAS NONTASK_POWER TTWU_QUEUE NO_FORCE_SD_OVERLAP RT_RUNTIME_SHARE NO_LB_MIN NO_NUMA NUMA_FAVOUR_HIGHER NO_NUMA_RESIST_LOWER
对着文档一个个仔细看吧 。 这里我们只关心 LAST_BUDDY 。
如果review相关的代码的话 , 我们会发现wakeup操作的下面的片段:
static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags){ ...if (wakeup_preempt_entity(se, pse) == 1) {/** Bias pick_next to pick the sched entity that is* triggering this preemption.*/if (!next_buddy_marked)set_next_buddy(pse);goto preempt;}...preempt:resched_task(curr);/** Only set the backward buddy when the current task is still* on the rq. This can happen when a wakeup gets interleaved* with schedule on the ->pre_schedule() or idle_balance()* point, either of which can * drop the rq lock.** Also, during early boot the idle thread is in the fair class,* for obvious reasons its a bad idea to schedule back to it.*/if (unlikely(!se->on_rq || curr == rq->idle))return; // 这里是关键!设置了一个nextif (sched_feat(LAST_BUDDY) }以及pick_next操作的下面的片段:
/* 看这些注释就够了! * Pick the next process, keeping these things in mind, in this order: * 1) keep things fair between processes/task groups * 2) pick the "next" process, since someone really wants that to run * 3) pick the "last" process, for cache locality * 4) do not run the "skip" process, if something else is available */static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq){ ... /** Prefer last buddy, try to return the CPU to a preempted task.*/// next比last要优先!if (cfs_rq->last/** Someone really wants this to run. If it's not unfair, run it.*/if (cfs_rq->nextclear_buddies(cfs_rq, se);}这里就有疑点了 。
设生产者为P , 消费者为C , 则:

  • P唤醒了C , 那么P就会成为last 。
  • C投入运行 。
  • C运行结束进入阻塞 。
  • CPU进行调度 , pick next 。
  • last抢先leftmost而胜出 。 last的vruntime虽然比leftmost的大 , 但两者之差不足一个granularity , 因此优选last!
上述最后一点需要解释 , 我还是解释一下内核函数wakeup_preempt_entity的注释吧:
从一个CFS调度案例谈Linux系统卡顿的根源文章插图
事情到此为止 , 其实问题的解法已经有了 , 大致就是:
  • 禁用LAST_BUDDY feature.
而这个只需要一条命令即可:
[root@localhost test]# echo NO_LAST_BUDDY >/sys/kernel/debug/sched_features
问题解决之后 , 却并未满足自己的好奇心 , 我还是希望看个究竟 。
为了挖出事情怎么发生的 , 仅靠perf就不够了 , 需要systemtap来帮忙 。
下面的脚本可以探测在进程被唤醒 , 调度切换的时候 , 细节到底是什么样的:
#!/usr/bin/stap -gglobal g_cfs_rq;probe begin { g_cfs_rq = 0;}function container_of_entity:long(se:long){ offset =return se - offset;}function container_to_entity:long(task:long){ offset =return task +offset;}function entity_to_rbnode:long(rb:long){ offset =return rb - offset;}function print_task(s:string, se:long, verbose:long, min_vruntime:long){ my_q = @cast(se, "struct sched_entity")->my_q; if(my_q == 0) {t_se = container_of_entity(se);printf("%8s %p%s %d \n", s, t_se, task_execname(t_se), task_pid(t_se)); }}probe kernel.function("pick_next_task_fair"){ printf("--------------- begin pick_next_task_fair --------------->\n"); g_cfs_rq = }probe kernel.function("pick_next_entity"){ if (g_cfs_rq == 0)next; printf("------- begin pick_next_entity ------->\n"); cfsq = g_cfs_rq; vrun_first = 0; vrun_last = 0; last = @cast(cfsq, "struct cfs_rq")->last; if (last) {my_q = @cast(last, "struct sched_entity")->my_q;if(my_q != 0) {cfsq = @cast(last, "struct sched_entity")->my_q;last = @cast(cfsq, "struct cfs_rq")->last;}t_last = container_of_entity(last);vrun_last = @cast(last, "struct sched_entity")->vruntime;printf("LAST:[%s] vrun:%d\t", task_execname(t_last), vrun_last); } firstrb = @cast(cfsq, "struct cfs_rq")->rb_leftmost; if (firstrb) {firstse = entity_to_rbnode(firstrb);my_q = @cast(firstse, "struct sched_entity")->my_q;if(my_q != 0) {firstrb = @cast(my_q, "struct cfs_rq")->rb_leftmost;firstse = entity_to_rbnode(firstrb);}t_first = container_of_entity(firstse);vrun_first = @cast(firstse, "struct sched_entity")->vruntime;printf("FIRST:[%s] vrun:%d\t", task_execname(t_first), vrun_first); } if (last} else {printf("delta: N/A\n"); } printf("