delete NIC from poll list in process_backlog()
lx
lxlenovostar at gmail.com
Tue Dec 23 21:42:18 EST 2014
hi all:
I think I got it. The new kernel just remove the NIC from poll list
first, If we should repoll it, kernel will add it again. The codes is:
4560 static void net_rx_action(struct softirq_action *h)
4561 {
4562 struct softnet_data *sd = this_cpu_ptr(&softnet_data);
4563 unsigned long time_limit = jiffies + 2;
4564 int budget = netdev_budget;
4565 LIST_HEAD(list);
4566 LIST_HEAD(repoll);
4567 void *have;
4568
4569 local_irq_disable();
4570 list_splice_init(&sd->poll_list, &list);
4571 local_irq_enable();
4572
4573 while (!list_empty(&list)) {
4574 struct napi_struct *n;
4575 int work, weight;
4576
4577 /* If softirq window is exhausted then punt.
4578 * Allow this to run for 2 jiffies since which will allow
4579 * an average latency of 1.5/HZ.
4580 */
4581 if (unlikely(budget <= 0 || time_after_eq(jiffies,
time_limit)))
4582 goto softnet_break;
4583
4584
4585 n = list_first_entry(&list, struct napi_struct, poll_list);
4586 list_del_init(&n->poll_list);
4587
4588 have = netpoll_poll_lock(n);
4589
4590 weight = n->weight;
4591
4592 /* This NAPI_STATE_SCHED test is for avoiding a race
4593 * with netpoll's poll_napi(). Only the entity which
4594 * obtains the lock and sees NAPI_STATE_SCHED set will
4595 * actually make the ->poll() call. Therefore we avoid
4596 * accidentally calling ->poll() when NAPI is not scheduled.
4597 */
4598 work = 0;
4599 if (test_bit(NAPI_STATE_SCHED, &n->state)) {
4600 work = n->poll(n, weight);
4601 trace_napi_poll(n);
4602 }
4603
4604 WARN_ON_ONCE(work > weight);
4605
4606 budget -= work;
4607
4608 /* Drivers must not modify the NAPI state if they
4609 * consume the entire weight. In such cases this code
4610 * still "owns" the NAPI instance and therefore can
4611 * move the instance around on the list at-will.
4612 */
4613 if (unlikely(work == weight)) {
4614 if (unlikely(napi_disable_pending(n))) {
4615 napi_complete(n);
4616 } else {
4617 if (n->gro_list) {
4618 /* flush too old packets
4619 * If HZ < 1000, flush all packets.
4620 */
4621 napi_gro_flush(n, HZ >= 1000);
4622 }
4623 list_add_tail(&n->poll_list, &repoll);
4624 }
4625 }
4626
4627 netpoll_poll_unlock(have);
4628 }
4629
4630 if (!sd_has_rps_ipi_waiting(sd) &&
4631 list_empty(&list) &&
4632 list_empty(&repoll))
4633 return;
4634 out:
4635 local_irq_disable();
4636
4637 list_splice_tail_init(&sd->poll_list, &list);
4638 list_splice_tail(&repoll, &list);
4639 list_splice(&list, &sd->poll_list);
4640 if (!list_empty(&sd->poll_list))
4641 __raise_softirq_irqoff(NET_RX_SOFTIRQ);
4642
4643 net_rps_action_and_irq_enable(sd);
4644
4645 return;
4646
4647 softnet_break:
4648 sd->time_squeeze++;
4649 goto out;
4650 }
line 4585: choose a NIC which will poll.
line 4586: remove the NIC from the poll list.
line 4623: add the NIC again in repoll list if we nedd poll it.
line 4638: splice the poll list and repoll list.
So, I think this is not a bug.
@valdis.kletnieks
I think we should remove NIC form poll list after the list of
sd->input_pkt_queue is empty, so I think code should run like this:
4350 rps_lock(sd);
4351 if (skb_queue_empty(&sd->input_pkt_queue)) {
4352 /*
4353 * Inline a custom version of __napi_complete().
4354 * only current cpu owns and manipulates this napi,
4355 * and NAPI_STATE_SCHED is the only possible flag set
4356 * on backlog.
4357 * We can use a plain write instead of clear_bit(),
4358 * and we dont need an smp_mb() memory barrier.
4359 */
list_del(&napi->poll_list); // I add it.
4360 napi->state = 0;
4361 rps_unlock(sd);
4362
4363 break;
4364 }
Thank you
在 2014/12/23 20:13, Vignesh Radhakrishnan 写道:
> We do delete it. I am not very thorough with the code , but here is what
> i think is happening.
>
> process_backlog is set as the backlog polling function for softnet data
> sd->backlog.poll = process_backlog;
>
> Now net_rx_action which runs in soft irq context, calls the polling as
> per the following code and the deletion as well.
>
>
> Notice where we are deleting it :
>
> n = list_first_entry(&list, struct napi_struct, poll_list);
> list_del_init(&n->poll_list);
> --------------------------------------> Here is the deletion.
>
> have = netpoll_poll_lock(n);
>
> weight = n->weight;
>
> /* This NAPI_STATE_SCHED test is for avoiding a race
> * with netpoll's poll_napi(). Only the entity which
> * obtains the lock and sees NAPI_STATE_SCHED set will
> * actually make the ->poll() call. Therefore we avoid
> * accidentally calling ->poll() when NAPI is not
> scheduled.
> */
> work = 0;
> if (test_bit(NAPI_STATE_SCHED, &n->state)) {
> work = n->poll(n, weight);
> -------------------------------------> Process_backlog is called here
> trace_napi_poll(n);
> }
>
> Earlier we had deletion in process_backlog itself, but
> https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d75b1ade567ffab085e8adbbdacf0092d10cd09c
> has changed it.
>
> Thanks and regards,
> Vignesh Radhakrishnan
>
> On Tue, Dec 23, 2014 at 2:29 PM, lx <lxlenovostar at gmail.com
> <mailto:lxlenovostar at gmail.com>> wrote:
>
> hi all:
> I read the function of process_backlog, and I found it don't
> delete the NIC form poll list. the codes is:
>
> 4321 static int process_backlog(struct napi_struct *napi, int quota)
> 4322 {
> 4323 int work = 0;
> 4324 struct softnet_data *sd = container_of(napi, struct
> softnet_data, backlog);
> 4325
> 4326 /* Check if we have pending ipi, its better to send them now,
> 4327 * not waiting net_rx_action() end.
> 4328 */
> 4329 if (sd_has_rps_ipi_waiting(sd)) {
> 4330 local_irq_disable();
> 4331 net_rps_action_and_irq_enable(sd);
> 4332 }
> 4333
> 4334 napi->weight = weight_p;
> 4335 local_irq_disable();
> 4336 while (1) {
> 4337 struct sk_buff *skb;
> 4338
> 4339 while ((skb = __skb_dequeue(&sd->process_queue))) {
> 4340 local_irq_enable();
> 4341 __netif_receive_skb(skb);
> 4342 local_irq_disable();
> 4343 input_queue_head_incr(sd);
> 4344 if (++work >= quota) {
> 4345 local_irq_enable();
> 4346 return work;
> 4347 }
> 4348 }
> 4349
> 4350 rps_lock(sd);
> 4351 if (skb_queue_empty(&sd->input_pkt_queue)) {
> 4352 /*
> 4353 * Inline a custom version of __napi_complete().
> 4354 * only current cpu owns and manipulates this napi,
> 4355 * and NAPI_STATE_SCHED is the only possible flag set
> 4356 * on backlog.
> 4357 * We can use a plain write instead of clear_bit(),
> 4358 * and we dont need an smp_mb() memory barrier.
> 4359 */
> 4360 napi->state = 0;
> 4361 rps_unlock(sd);
> 4362
> 4363 break;
> 4364 }
> 4365
> 4366 skb_queue_splice_tail_init(&sd->input_pkt_queue,
> 4367 &sd->process_queue);
> 4368 rps_unlock(sd);
> 4369 }
> 4370 local_irq_enable();
> 4371
> 4372 return work;
> 4373 }
>
> I think we should delete the NIC from the poll list by:
> list_del(&napi->poll_list);
>
> and my git repository infromation is:
>
> [root at localhost linux]# git show
> commit aa39477b5692611b91ac9455ae588738852b3f60
> Merge: 48ec833 5164bec
> Author: Linus Torvalds <torvalds at linux-foundation.org
> <mailto:torvalds at linux-foundation.org>>
> Date: Mon Dec 22 14:47:17 2014 -0800
>
> Merge tag 'dm-3.19-fixes' of
> git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
> <http://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm>
> Pull device mapper fixes from Mike Snitzer:
> "Thre stable fixes and one fix for a regression introduced
> during 3.19
> merge:
> - Fix inability to discard used space when the thin-pool
> target is in
> out-of-data-space mode and also transition the thin-pool
> back to
> write mode once free space is made available.
> - Fix DM core bio-based end_io bug that prevented proper
> post-processing of the error code returned from the block
> layer.
> - Fix crash in DM thin-pool due to thin device being added
> to the
> pool's active_thins list before properly initializing the thin
> device's refcount"
> * tag 'dm-3.19-fixes' of
> git://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm
> <http://git.kernel.org/pub/scm/linux/kernel/git/device-mapper/linux-dm>:
> dm: fix missed error code if .end_io isn't implemented by
> target_type
> dm thin: fix crash by initializing thin device's refcount and
> completion earlier
> dm thin: fix missing out-of-data-space to write mode
> transition if blocks are released
> dm thin: fix inability to discard blocks when in
> out-of-data-space mode
>
>
> Don't handle the list_del is right, or this is a bug?
>
> Thank you.
>
> _______________________________________________
> Kernelnewbies mailing list
> Kernelnewbies at kernelnewbies.org <mailto:Kernelnewbies at kernelnewbies.org>
> http://lists.kernelnewbies.org/mailman/listinfo/kernelnewbies
>
>
>
>
> --
> http://vigneshradhakrishnan.blogspot.com/
More information about the Kernelnewbies
mailing list