From 64f3b9e203bd06855072e295557dca1485a2ecba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 4 May 2011 10:02:26 +0000 Subject: [PATCH 01/27] net: ip_expire() must revalidate route Commit 4a94445c9a5c (net: Use ip_route_input_noref() in input path) added a bug in IP defragmentation handling, in case timeout is fired. When a frame is defragmented, we use last skb dst field when building final skb. Its dst is valid, since we are in rcu read section. But if a timeout occurs, we take first queued fragment to build one ICMP TIME EXCEEDED message. Problem is all queued skb have weak dst pointers, since we escaped RCU critical section after their queueing. icmp_send() might dereference a now freed (and possibly reused) part of memory. Calling skb_dst_drop() and ip_route_input_noref() to revalidate route is the only possible choice. Reported-by: Denys Fedoryshchenko Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_fragment.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a1151b8adf3c..b1d282f11be7 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -223,31 +223,30 @@ static void ip_expire(unsigned long arg) if ((qp->q.last_in & INET_FRAG_FIRST_IN) && qp->q.fragments != NULL) { struct sk_buff *head = qp->q.fragments; + const struct iphdr *iph; + int err; rcu_read_lock(); head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out_rcu_unlock; + /* skb dst is stale, drop it, and perform route lookup again */ + skb_dst_drop(head); + iph = ip_hdr(head); + err = ip_route_input_noref(head, iph->daddr, iph->saddr, + iph->tos, head->dev); + if (err) + goto out_rcu_unlock; + /* - * Only search router table for the head fragment, - * when defraging timeout at PRE_ROUTING HOOK. + * Only an end host needs to send an ICMP + * "Fragment Reassembly Timeout" message, per RFC792. */ - if (qp->user == IP_DEFRAG_CONNTRACK_IN && !skb_dst(head)) { - const struct iphdr *iph = ip_hdr(head); - int err = ip_route_input(head, iph->daddr, iph->saddr, - iph->tos, head->dev); - if (unlikely(err)) - goto out_rcu_unlock; + if (qp->user == IP_DEFRAG_CONNTRACK_IN && + skb_rtable(head)->rt_type != RTN_LOCAL) + goto out_rcu_unlock; - /* - * Only an end host needs to send an ICMP - * "Fragment Reassembly Timeout" message, per RFC792. - */ - if (skb_rtable(head)->rt_type != RTN_LOCAL) - goto out_rcu_unlock; - - } /* Send an ICMP "Fragment Reassembly Timeout" message. */ icmp_send(head, ICMP_TIME_EXCEEDED, ICMP_EXC_FRAGTIME, 0); From 75bd0cbdc21d80859c80bdd5dd00125c1a3ccbca Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Thu, 28 Apr 2011 22:37:09 +0000 Subject: [PATCH 02/27] usbnet: runtime pm: fix out of memory This patch makes use of the EVENT_DEV_OPEN flag introduced recently to fix one out of memory issue, which can be reproduced on omap3/4 based pandaboard/beagle XM easily with steps below: - enable runtime pm echo auto > /sys/devices/platform/usbhs-omap.0/ehci-omap.0/usb1/1-1/1-1.1/power/control - ifconfig eth0 up - then out of memroy happened, see [1] for kernel message. Follows my analysis: - 'ifconfig eth0 up' brings eth0 out of suspend, and usbnet_resume is called to schedule dev->bh, then rx urbs are submited to prepare for recieving data; - some usbnet devices will produce garbage rx packets flood if info->reset is not called in usbnet_open. - so there is no enough chances for usbnet_bh to handle and release recieved skb buffers since many rx interrupts consumes cpu, so out of memory for atomic allocation in rx_submit happened. This patch fixes the issue by simply not allowing schedule of usbnet_bh until device is opened. [1], dmesg [ 234.712005] smsc95xx 1-1.1:1.0: rpm_resume flags 0x4 [ 234.712066] usb 1-1.1: rpm_resume flags 0x0 [ 234.712066] usb 1-1: rpm_resume flags 0x0 [ 234.712097] usb usb1: rpm_resume flags 0x0 [ 234.712127] usb usb1: usb auto-resume [ 234.712158] ehci-omap ehci-omap.0: resume root hub [ 234.754028] hub 1-0:1.0: hub_resume [ 234.754821] hub 1-0:1.0: port 1: status 0507 change 0000 [ 234.756011] hub 1-0:1.0: state 7 ports 3 chg 0000 evt 0000 [ 234.756042] hub 1-0:1.0: rpm_resume flags 0x4 [ 234.756072] usb usb1: rpm_resume flags 0x0 [ 234.756164] usb usb1: rpm_resume returns 1 [ 234.756195] hub 1-0:1.0: rpm_resume returns 0 [ 234.756195] hub 1-0:1.0: rpm_suspend flags 0x4 [ 234.756225] hub 1-0:1.0: rpm_suspend returns 0 [ 234.756256] usb usb1: rpm_resume returns 0 [ 234.757141] usb 1-1: usb auto-resume [ 234.793151] ehci-omap ehci-omap.0: GetStatus port:1 status 001005 0 ACK POWER sig=se0 PE CONNECT [ 234.816558] usb 1-1: finish resume [ 234.817871] hub 1-1:1.0: hub_resume [ 234.818420] hub 1-1:1.0: port 1: status 0507 change 0000 [ 234.820495] ehci-omap ehci-omap.0: reused qh eec50220 schedule [ 234.820495] usb 1-1: link qh256-0001/eec50220 start 1 [1/0 us] [ 234.820587] usb 1-1: rpm_resume returns 0 [ 234.820800] hub 1-1:1.0: state 7 ports 5 chg 0000 evt 0000 [ 234.820800] hub 1-1:1.0: rpm_resume flags 0x4 [ 234.820831] hub 1-1:1.0: rpm_resume returns 0 [ 234.820861] hub 1-1:1.0: rpm_suspend flags 0x4 [ 234.820861] hub 1-1:1.0: rpm_suspend returns 0 [ 234.821777] usb 1-1.1: usb auto-resume [ 234.868591] hub 1-1:1.0: state 7 ports 5 chg 0000 evt 0002 [ 234.868591] hub 1-1:1.0: rpm_resume flags 0x4 [ 234.868621] hub 1-1:1.0: rpm_resume returns 0 [ 234.868652] hub 1-1:1.0: rpm_suspend flags 0x4 [ 234.868652] hub 1-1:1.0: rpm_suspend returns 0 [ 234.879486] usb 1-1.1: finish resume [ 234.880279] usb 1-1.1: rpm_resume returns 0 [ 234.880310] smsc95xx 1-1.1:1.0: rpm_resume returns 0 [ 238.880187] ksoftirqd/0: page allocation failure. order:0, mode:0x20 [ 238.880218] Backtrace: [ 238.880249] [] (dump_backtrace+0x0/0xf8) from [] (dump_stack+0x18/0x1c) [ 238.880249] r6:00000000 r5:00000000 r4:00000020 r3:00000002 [ 238.880310] [] (dump_stack+0x0/0x1c) from [] (__alloc_pages_nodemask+0x620/0x724) [ 238.880340] [] (__alloc_pages_nodemask+0x0/0x724) from [] (kmem_getpages.clone.34+0x34/0xc8) [ 238.880371] [] (kmem_getpages.clone.34+0x0/0xc8) from [] (cache_grow.clone.42+0x84/0x154) [ 238.880371] r6:ef871aa4 r5:ef871a80 r4:ef81fd40 r3:00000020 [ 238.880401] [] (cache_grow.clone.42+0x0/0x154) from [] (cache_alloc_refill+0x19c/0x1f0) [ 238.880432] [] (cache_alloc_refill+0x0/0x1f0) from [] (kmem_cache_alloc+0x90/0x190) [ 238.880462] [] (kmem_cache_alloc+0x0/0x190) from [] (__alloc_skb+0x34/0xe8) [ 238.880493] [] (__alloc_skb+0x0/0xe8) from [] (rx_submit+0x2c/0x1d4 [usbnet]) [ 238.880523] [] (rx_submit+0x0/0x1d4 [usbnet]) from [] (rx_complete+0x19c/0x1b0 [usbnet]) [ 238.880737] [] (rx_complete+0x0/0x1b0 [usbnet]) from [] (usb_hcd_giveback_urb+0xa8/0xf4 [usbcore]) [ 238.880737] r8:eeeced34 r7:eeecec00 r6:eeecec00 r5:00000000 r4:eec2dd20 [ 238.880767] r3:bf050b9c [ 238.880859] [] (usb_hcd_giveback_urb+0x0/0xf4 [usbcore]) from [] (ehci_urb_done+0xb0/0xbc [ehci_hcd]) [ 238.880859] r6:00000000 r5:eec2dd20 r4:eeeced44 r3:eec2dd34 [ 238.880920] [] (ehci_urb_done+0x0/0xbc [ehci_hcd]) from [] (qh_completions+0x308/0x3bc [ehci_hcd]) [ 238.880920] r7:00000000 r6:eeda21a0 r5:ffdfe3c0 r4:eeda21ac [ 238.880981] [] (qh_completions+0x0/0x3bc [ehci_hcd]) from [] (scan_async+0xb0/0x16c [ehci_hcd]) [ 238.881011] [] (scan_async+0x0/0x16c [ehci_hcd]) from [] (ehci_work+0x38/0x90 [ehci_hcd]) [ 238.881042] [] (ehci_work+0x0/0x90 [ehci_hcd]) from [] (ehci_irq+0x300/0x34c [ehci_hcd]) [ 238.881072] r4:eeeced34 r3:00000001 [ 238.881134] [] (ehci_irq+0x0/0x34c [ehci_hcd]) from [] (usb_hcd_irq+0x40/0xac [usbcore]) [ 238.881195] [] (usb_hcd_irq+0x0/0xac [usbcore]) from [] (handle_irq_event_percpu+0xb8/0x240) [ 238.881225] r6:eec504e0 r5:0000006d r4:eec504e0 r3:bf0067e8 [ 238.881256] [] (handle_irq_event_percpu+0x0/0x240) from [] (handle_irq_event+0x44/0x64) [ 238.881256] [] (handle_irq_event+0x0/0x64) from [] (handle_level_irq+0xe0/0x114) [ 238.881286] r6:0000006d r5:c080c14c r4:c080c100 r3:00020000 [ 238.881317] [] (handle_level_irq+0x0/0x114) from [] (asm_do_IRQ+0x90/0xd0) [ 238.881317] r5:00000000 r4:0000006d [ 238.881347] [] (asm_do_IRQ+0x0/0xd0) from [] (__irq_svc+0x50/0x134) [ 238.881378] Exception stack(0xef837e20 to 0xef837e68) [ 238.881378] 7e20: 00000001 00185610 016cc000 c00490c0 eb380000 ef800540 00000020 00004ae0 [ 238.881408] 7e40: 00000020 bf0509f4 60000013 ef837e9c ef837e40 ef837e68 c0226f0c c0298ca0 [ 238.881408] 7e60: 20000013 ffffffff [ 238.881408] r5:fa240100 r4:ffffffff [ 238.881439] [] (__kmalloc_track_caller+0x0/0x1d0) from [] (__alloc_skb+0x58/0xe8) [ 238.881469] [] (__alloc_skb+0x0/0xe8) from [] (rx_submit+0x2c/0x1d4 [usbnet]) [ 238.881500] [] (rx_submit+0x0/0x1d4 [usbnet]) from [] (usbnet_bh+0x1b4/0x250 [usbnet]) [ 238.881530] [] (usbnet_bh+0x0/0x250 [usbnet]) from [] (tasklet_action+0xb0/0x1f8) [ 238.881530] r6:00000000 r5:ef9757f0 r4:ef9757ec r3:bf051224 [ 238.881561] [] (tasklet_action+0x0/0x1f8) from [] (__do_softirq+0x140/0x290) [ 238.881561] r8:00000006 r7:00000101 r6:00000000 r5:c0806098 r4:00000001 [ 238.881591] r3:c01f907c [ 238.881622] [] (__do_softirq+0x0/0x290) from [] (run_ksoftirqd+0xd0/0x1f4) [ 238.881622] [] (run_ksoftirqd+0x0/0x1f4) from [] (kthread+0x90/0x98) [ 238.881652] r7:00000013 r6:c01f98fc r5:00000000 r4:ef831efc [ 238.881683] [] (kthread+0x0/0x98) from [] (do_exit+0x0/0x374) [ 238.881713] r6:c01f62f4 r5:c0211320 r4:ef831efc [ 238.881713] Mem-info: [ 238.881744] Normal per-cpu: [ 238.881744] CPU 0: hi: 186, btch: 31 usd: 38 [ 238.881744] CPU 1: hi: 186, btch: 31 usd: 169 [ 238.881774] HighMem per-cpu: [ 238.881774] CPU 0: hi: 90, btch: 15 usd: 66 [ 238.881774] CPU 1: hi: 90, btch: 15 usd: 86 [ 238.881805] active_anon:544 inactive_anon:71 isolated_anon:0 [ 238.881805] active_file:926 inactive_file:2538 isolated_file:0 [ 238.881805] unevictable:0 dirty:10 writeback:0 unstable:0 [ 238.881805] free:57782 slab_reclaimable:864 slab_unreclaimable:186898 [ 238.881805] mapped:632 shmem:144 pagetables:50 bounce:0 [ 238.881835] Normal free:1328kB min:3532kB low:4412kB high:5296kB active_anon:0kB inactive_anon:0kB active_file:880kB inactive_file:848kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:780288kB mlocked:0kB dirty:36kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:3456kB slab_unreclaimable:747592kB kernel_stack:392kB pagetables:200kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no [ 238.881866] lowmem_reserve[]: 0 1904 1904 [ 238.881896] HighMem free:229800kB min:236kB low:508kB high:784kB active_anon:2176kB inactive_anon:284kB active_file:2824kB inactive_file:9304kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:243712kB mlocked:0kB dirty:4kB writeback:0kB mapped:2528kB shmem:576kB slab_reclaimable:0kB slab_unreclaimable:0kB kernel_stack:0kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no [ 238.881927] lowmem_reserve[]: 0 0 0 [ 238.881958] Normal: 0*4kB 4*8kB 6*16kB 0*32kB 1*64kB 1*128kB 0*256kB 2*512kB 0*1024kB 0*2048kB 0*4096kB = 1344kB [ 238.882019] HighMem: 6*4kB 2*8kB 4*16kB 4*32kB 1*64kB 1*128kB 0*256kB 2*512kB 3*1024kB 0*2048kB 55*4096kB = 229800kB [ 238.882080] 3610 total pagecache pages [ 238.882080] 0 pages in swap cache [ 238.882080] Swap cache stats: add 0, delete 0, find 0/0 [ 238.882110] Free swap = 0kB [ 238.882110] Total swap = 0kB [ 238.933776] 262144 pages of RAM [ 238.933776] 58240 free pages [ 238.933776] 10503 reserved pages [ 238.933776] 187773 slab pages [ 238.933807] 2475 pages shared [ 238.933807] 0 pages swap cached Signed-off-by: Ming Lei Acked-by: Oliver Neukum Signed-off-by: David S. Miller --- drivers/net/usb/usbnet.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 009bba3d753e..9ab439d144ed 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -645,6 +645,7 @@ int usbnet_stop (struct net_device *net) struct driver_info *info = dev->driver_info; int retval; + clear_bit(EVENT_DEV_OPEN, &dev->flags); netif_stop_queue (net); netif_info(dev, ifdown, dev->net, @@ -1524,9 +1525,12 @@ int usbnet_resume (struct usb_interface *intf) smp_mb(); clear_bit(EVENT_DEV_ASLEEP, &dev->flags); spin_unlock_irq(&dev->txq.lock); - if (!(dev->txq.qlen >= TX_QLEN(dev))) - netif_start_queue(dev->net); - tasklet_schedule (&dev->bh); + + if (test_bit(EVENT_DEV_OPEN, &dev->flags)) { + if (!(dev->txq.qlen >= TX_QLEN(dev))) + netif_start_queue(dev->net); + tasklet_schedule (&dev->bh); + } } return 0; } From 87e9af6cc67d842cd92b52b81f3f14e665e7ab05 Mon Sep 17 00:00:00 2001 From: Kurt Van Dijck Date: Mon, 2 May 2011 04:50:48 +0000 Subject: [PATCH 03/27] can: fix SJA1000 dlc for RTR packets RTR frames do have a valid data length code on CAN. The driver for SJA1000 did not handle that situation properly. Signed-off-by: Kurt Van Dijck Acked-by: Marc Kleine-Budde Signed-off-by: David S. Miller --- drivers/net/can/sja1000/sja1000.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/sja1000/sja1000.c b/drivers/net/can/sja1000/sja1000.c index a358ea9445a2..f501bba1fc6f 100644 --- a/drivers/net/can/sja1000/sja1000.c +++ b/drivers/net/can/sja1000/sja1000.c @@ -346,10 +346,10 @@ static void sja1000_rx(struct net_device *dev) | (priv->read_reg(priv, REG_ID2) >> 5); } + cf->can_dlc = get_can_dlc(fi & 0x0F); if (fi & FI_RTR) { id |= CAN_RTR_FLAG; } else { - cf->can_dlc = get_can_dlc(fi & 0x0F); for (i = 0; i < cf->can_dlc; i++) cf->data[i] = priv->read_reg(priv, dreg++); } From a294865978b701e4d0d90135672749531b9a900d Mon Sep 17 00:00:00 2001 From: Dan Rosenberg Date: Fri, 6 May 2011 03:27:18 +0000 Subject: [PATCH 04/27] dccp: handle invalid feature options length A length of zero (after subtracting two for the type and len fields) for the DCCPO_{CHANGE,CONFIRM}_{L,R} options will cause an underflow due to the subtraction. The subsequent code may read past the end of the options value buffer when parsing. I'm unsure of what the consequences of this might be, but it's probably not good. Signed-off-by: Dan Rosenberg Cc: stable@kernel.org Acked-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/options.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/dccp/options.c b/net/dccp/options.c index f06ffcfc8d71..4b2ab657ac8e 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -123,6 +123,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, case DCCPO_CHANGE_L ... DCCPO_CONFIRM_R: if (pkt_type == DCCP_PKT_DATA) /* RFC 4340, 6 */ break; + if (len == 0) + goto out_invalid_option; rc = dccp_feat_parse_options(sk, dreq, mandatory, opt, *value, value + 1, len - 1); if (rc) From e328d410826d52e9ee348aff9064c4a207f2adb1 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Fri, 6 May 2011 08:32:53 +0000 Subject: [PATCH 05/27] vmxnet3: Consistently disable irqs when taking adapter->cmd_lock Using the vmxnet3 driver produces a lockdep warning because vmxnet3_set_mc(), which is called with mc->mca_lock held, takes adapter->cmd_lock. However, there are a couple of places where adapter->cmd_lock is taken with softirqs enabled, lockdep warns that a softirq that tries to take mc->mca_lock could happen while adapter->cmd_lock is held, leading to an AB-BA deadlock. I'm not sure if this is a real potential deadlock or not, but the simplest and best fix seems to be simply to make sure we take cmd_lock with spin_lock_irqsave() everywhere -- the places with plain spin_lock just look like oversights. The full enormous lockdep warning is: ========================================================= [ INFO: possible irq lock inversion dependency detected ] 2.6.39-rc6+ #1 --------------------------------------------------------- ifconfig/567 just changed the state of lock: (&(&mc->mca_lock)->rlock){+.-...}, at: [] mld_ifc_timer_expire+0xff/0x280 but this lock took another, SOFTIRQ-unsafe lock in the past: (&(&adapter->cmd_lock)->rlock){+.+...} and interrupts could create inverse lock ordering between them. other info that might help us debug this: 4 locks held by ifconfig/567: #0: (rtnl_mutex){+.+.+.}, at: [] rtnl_lock+0x17/0x20 #1: ((inetaddr_chain).rwsem){.+.+.+}, at: [] __blocking_notifier_call_chain+0x5f/0xb0 #2: (&idev->mc_ifc_timer){+.-...}, at: [] run_timer_softirq+0xeb/0x3f0 #3: (&ndev->lock){++.-..}, at: [] mld_ifc_timer_expire+0x32/0x280 the shortest dependencies between 2nd lock and 1st lock: -> (&(&adapter->cmd_lock)->rlock){+.+...} ops: 11 { HARDIRQ-ON-W at: [] __lock_acquire+0x7f6/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock+0x36/0x70 [] vmxnet3_alloc_intr_resources+0x22/0x230 [vmxnet3] [] vmxnet3_probe_device+0x5f6/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b SOFTIRQ-ON-W at: [] __lock_acquire+0x827/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock+0x36/0x70 [] vmxnet3_alloc_intr_resources+0x22/0x230 [vmxnet3] [] vmxnet3_probe_device+0x5f6/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b INITIAL USE at: [] __lock_acquire+0x459/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock+0x36/0x70 [] vmxnet3_alloc_intr_resources+0x22/0x230 [vmxnet3] [] vmxnet3_probe_device+0x5f6/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b } ... key at: [] __key.42516+0x0/0xffffffffffffda70 [vmxnet3] ... acquired at: [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_irqsave+0x55/0xa0 [] vmxnet3_set_mc+0x97/0x1a0 [vmxnet3] [] __dev_set_rx_mode+0x40/0xb0 [] dev_set_rx_mode+0x30/0x50 [] __dev_open+0xc7/0x100 [] __dev_change_flags+0xa1/0x180 [] dev_change_flags+0x28/0x70 [] devinet_ioctl+0x730/0x800 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] do_vfs_ioctl+0x98/0x570 [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b -> (_xmit_ETHER){+.....} ops: 6 { HARDIRQ-ON-W at: [] __lock_acquire+0x7f6/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] __dev_mc_add+0x38/0x90 [] dev_mc_add+0x10/0x20 [] igmp6_group_added+0x10e/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_notify+0x2f7/0xb10 [] notifier_call_chain+0x8c/0xc0 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x37/0x70 [] register_netdevice+0x244/0x2d0 [] register_netdev+0x3f/0x60 [] vmxnet3_probe_device+0x760/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b INITIAL USE at: [] __lock_acquire+0x459/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] __dev_mc_add+0x38/0x90 [] dev_mc_add+0x10/0x20 [] igmp6_group_added+0x10e/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_notify+0x2f7/0xb10 [] notifier_call_chain+0x8c/0xc0 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x37/0x70 [] register_netdevice+0x244/0x2d0 [] register_netdev+0x3f/0x60 [] vmxnet3_probe_device+0x760/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b } ... key at: [] netdev_addr_lock_key+0x8/0x1e0 ... acquired at: [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] __dev_mc_add+0x38/0x90 [] dev_mc_add+0x10/0x20 [] igmp6_group_added+0x10e/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_notify+0x2f7/0xb10 [] notifier_call_chain+0x8c/0xc0 [] raw_notifier_call_chain+0x16/0x20 [] call_netdevice_notifiers+0x37/0x70 [] register_netdevice+0x244/0x2d0 [] register_netdev+0x3f/0x60 [] vmxnet3_probe_device+0x760/0x15c5 [vmxnet3] [] local_pci_probe+0x5f/0xd0 [] pci_device_probe+0x119/0x120 [] driver_probe_device+0x96/0x1c0 [] __driver_attach+0xab/0xb0 [] bus_for_each_dev+0x5e/0x90 [] driver_attach+0x1e/0x20 [] bus_add_driver+0xc8/0x290 [] driver_register+0x76/0x140 [] __pci_register_driver+0x66/0xe0 [] serio_raw_poll+0x3a/0x60 [serio_raw] [] do_one_initcall+0x45/0x190 [] sys_init_module+0xfb/0x250 [] system_call_fastpath+0x16/0x1b -> (&(&mc->mca_lock)->rlock){+.-...} ops: 6 { HARDIRQ-ON-W at: [] __lock_acquire+0x7f6/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] igmp6_group_added+0x45/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_init+0x4e/0x183 [] inet6_init+0x191/0x2a6 [] do_one_initcall+0x45/0x190 [] kernel_init+0xe3/0x168 [] kernel_thread_helper+0x4/0x10 IN-SOFTIRQ-W at: [] __lock_acquire+0x7ce/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] mld_ifc_timer_expire+0xff/0x280 [] run_timer_softirq+0x179/0x3f0 [] __do_softirq+0xc0/0x210 [] call_softirq+0x1c/0x30 [] do_softirq+0xad/0xe0 [] irq_exit+0x9e/0xb0 [] smp_apic_timer_interrupt+0x70/0x9b [] apic_timer_interrupt+0x13/0x20 [] rt_do_flush+0x87/0x2a0 [] rt_cache_flush+0x46/0x60 [] fib_disable_ip+0x40/0x60 [] fib_inetaddr_event+0xd7/0xe0 [] notifier_call_chain+0x8c/0xc0 [] __blocking_notifier_call_chain+0x78/0xb0 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0xf1/0x2e0 [] inet_del_ifa+0x13/0x20 [] devinet_ioctl+0x501/0x800 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] do_vfs_ioctl+0x98/0x570 [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b INITIAL USE at: [] __lock_acquire+0x459/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] igmp6_group_added+0x45/0x1b0 [] ipv6_dev_mc_inc+0x2cd/0x430 [] ipv6_add_dev+0x357/0x450 [] addrconf_init+0x4e/0x183 [] inet6_init+0x191/0x2a6 [] do_one_initcall+0x45/0x190 [] kernel_init+0xe3/0x168 [] kernel_thread_helper+0x4/0x10 } ... key at: [] __key.40877+0x0/0x8 ... acquired at: [] check_usage_forwards+0x9c/0x110 [] mark_lock+0x19c/0x400 [] __lock_acquire+0x7ce/0x1e10 [] lock_acquire+0x9d/0x130 [] _raw_spin_lock_bh+0x3b/0x70 [] mld_ifc_timer_expire+0xff/0x280 [] run_timer_softirq+0x179/0x3f0 [] __do_softirq+0xc0/0x210 [] call_softirq+0x1c/0x30 [] do_softirq+0xad/0xe0 [] irq_exit+0x9e/0xb0 [] smp_apic_timer_interrupt+0x70/0x9b [] apic_timer_interrupt+0x13/0x20 [] rt_do_flush+0x87/0x2a0 [] rt_cache_flush+0x46/0x60 [] fib_disable_ip+0x40/0x60 [] fib_inetaddr_event+0xd7/0xe0 [] notifier_call_chain+0x8c/0xc0 [] __blocking_notifier_call_chain+0x78/0xb0 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0xf1/0x2e0 [] inet_del_ifa+0x13/0x20 [] devinet_ioctl+0x501/0x800 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] do_vfs_ioctl+0x98/0x570 [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b stack backtrace: Pid: 567, comm: ifconfig Not tainted 2.6.39-rc6+ #1 Call Trace: [] print_irq_inversion_bug+0x146/0x170 [] ? print_irq_inversion_bug+0x170/0x170 [] check_usage_forwards+0x9c/0x110 [] mark_lock+0x19c/0x400 [] __lock_acquire+0x7ce/0x1e10 [] ? mark_lock+0x1f3/0x400 [] ? __lock_acquire+0xf07/0x1e10 [] ? native_sched_clock+0x15/0x70 [] lock_acquire+0x9d/0x130 [] ? mld_ifc_timer_expire+0xff/0x280 [] ? lock_release_holdtime+0x3d/0x1a0 [] _raw_spin_lock_bh+0x3b/0x70 [] ? mld_ifc_timer_expire+0xff/0x280 [] ? _raw_spin_unlock+0x2b/0x40 [] mld_ifc_timer_expire+0xff/0x280 [] run_timer_softirq+0x179/0x3f0 [] ? run_timer_softirq+0xeb/0x3f0 [] ? sched_clock+0x9/0x10 [] ? mld_gq_timer_expire+0x30/0x30 [] __do_softirq+0xc0/0x210 [] ? tick_program_event+0x1f/0x30 [] call_softirq+0x1c/0x30 [] do_softirq+0xad/0xe0 [] irq_exit+0x9e/0xb0 [] smp_apic_timer_interrupt+0x70/0x9b [] apic_timer_interrupt+0x13/0x20 [] ? retint_restore_args+0x13/0x13 [] ? lock_is_held+0x17/0xd0 [] rt_do_flush+0x87/0x2a0 [] rt_cache_flush+0x46/0x60 [] fib_disable_ip+0x40/0x60 [] fib_inetaddr_event+0xd7/0xe0 [] notifier_call_chain+0x8c/0xc0 [] __blocking_notifier_call_chain+0x78/0xb0 [] blocking_notifier_call_chain+0x16/0x20 [] __inet_del_ifa+0xf1/0x2e0 [] inet_del_ifa+0x13/0x20 [] devinet_ioctl+0x501/0x800 [] ? local_clock+0x6f/0x80 [] ? do_page_fault+0x268/0x560 [] inet_ioctl+0x88/0xa0 [] sock_do_ioctl+0x30/0x70 [] sock_ioctl+0x79/0x2f0 [] ? __call_rcu+0xa7/0x190 [] do_vfs_ioctl+0x98/0x570 [] ? fget_light+0x33e/0x430 [] ? retint_swapgs+0x13/0x1b [] sys_ioctl+0x91/0xa0 [] system_call_fastpath+0x16/0x1b Signed-off-by: Roland Dreier Signed-off-by: Shreyas N Bhatewara Signed-off-by: Scott J. Goldman Signed-off-by: David S. Miller --- drivers/net/vmxnet3/vmxnet3_drv.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/net/vmxnet3/vmxnet3_drv.c b/drivers/net/vmxnet3/vmxnet3_drv.c index 0d47c3a05307..c16ed961153a 100644 --- a/drivers/net/vmxnet3/vmxnet3_drv.c +++ b/drivers/net/vmxnet3/vmxnet3_drv.c @@ -178,6 +178,7 @@ static void vmxnet3_process_events(struct vmxnet3_adapter *adapter) { int i; + unsigned long flags; u32 events = le32_to_cpu(adapter->shared->ecr); if (!events) return; @@ -190,10 +191,10 @@ vmxnet3_process_events(struct vmxnet3_adapter *adapter) /* Check if there is an error on xmit/recv queues */ if (events & (VMXNET3_ECR_TQERR | VMXNET3_ECR_RQERR)) { - spin_lock(&adapter->cmd_lock); + spin_lock_irqsave(&adapter->cmd_lock, flags); VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_QUEUE_STATUS); - spin_unlock(&adapter->cmd_lock); + spin_unlock_irqrestore(&adapter->cmd_lock, flags); for (i = 0; i < adapter->num_tx_queues; i++) if (adapter->tqd_start[i].status.stopped) @@ -2733,13 +2734,14 @@ static void vmxnet3_alloc_intr_resources(struct vmxnet3_adapter *adapter) { u32 cfg; + unsigned long flags; /* intr settings */ - spin_lock(&adapter->cmd_lock); + spin_lock_irqsave(&adapter->cmd_lock, flags); VMXNET3_WRITE_BAR1_REG(adapter, VMXNET3_REG_CMD, VMXNET3_CMD_GET_CONF_INTR); cfg = VMXNET3_READ_BAR1_REG(adapter, VMXNET3_REG_CMD); - spin_unlock(&adapter->cmd_lock); + spin_unlock_irqrestore(&adapter->cmd_lock, flags); adapter->intr.type = cfg & 0x3; adapter->intr.mask_mode = (cfg >> 2) & 0x3; From 9c412942a0bb19ba18f7bd939d42eff1e132a901 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 3 May 2011 07:49:25 +0000 Subject: [PATCH 06/27] ipheth: Properly distinguish length and alignment in URBs and skbs The USB protocol this driver implements appears to require 2 bytes of padding in front of each received packet. This used to be equal to the value of NET_IP_ALIGN on x86, so the driver abused that constant and mostly worked, but this is no longer the case. The driver also mixed up the URB and packet lengths, resulting in 2 bytes of junk at the end of the skb. Introduce a private constant for the 2 bytes of padding; fix this confusion and check for the under-length case. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- drivers/net/usb/ipheth.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/drivers/net/usb/ipheth.c b/drivers/net/usb/ipheth.c index 7d42f9a2c068..81126ff85e05 100644 --- a/drivers/net/usb/ipheth.c +++ b/drivers/net/usb/ipheth.c @@ -65,6 +65,7 @@ #define IPHETH_USBINTF_PROTO 1 #define IPHETH_BUF_SIZE 1516 +#define IPHETH_IP_ALIGN 2 /* padding at front of URB */ #define IPHETH_TX_TIMEOUT (5 * HZ) #define IPHETH_INTFNUM 2 @@ -202,18 +203,21 @@ static void ipheth_rcvbulk_callback(struct urb *urb) return; } - len = urb->actual_length; - buf = urb->transfer_buffer; + if (urb->actual_length <= IPHETH_IP_ALIGN) { + dev->net->stats.rx_length_errors++; + return; + } + len = urb->actual_length - IPHETH_IP_ALIGN; + buf = urb->transfer_buffer + IPHETH_IP_ALIGN; - skb = dev_alloc_skb(NET_IP_ALIGN + len); + skb = dev_alloc_skb(len); if (!skb) { err("%s: dev_alloc_skb: -ENOMEM", __func__); dev->net->stats.rx_dropped++; return; } - skb_reserve(skb, NET_IP_ALIGN); - memcpy(skb_put(skb, len), buf + NET_IP_ALIGN, len - NET_IP_ALIGN); + memcpy(skb_put(skb, len), buf, len); skb->dev = dev->net; skb->protocol = eth_type_trans(skb, dev->net); From b9f47a3aaeabdce3b42829bbb27765fa340f76ba Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 4 May 2011 10:04:56 +0000 Subject: [PATCH 07/27] tcp_cubic: limit delayed_ack ratio to prevent divide error TCP Cubic keeps a metric that estimates the amount of delayed acknowledgements to use in adjusting the window. If an abnormally large number of packets are acknowledged at once, then the update could wrap and reach zero. This kind of ACK could only happen when there was a large window and huge number of ACK's were lost. This patch limits the value of delayed ack ratio. The choice of 32 is just a conservative value since normally it should be range of 1 to 4 packets. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 34340c9c95fa..f376b05cca81 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -93,6 +93,7 @@ struct bictcp { u32 ack_cnt; /* number of acks */ u32 tcp_cwnd; /* estimated tcp cwnd */ #define ACK_RATIO_SHIFT 4 +#define ACK_RATIO_LIMIT (32u << ACK_RATIO_SHIFT) u16 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ u8 sample_cnt; /* number of samples to decide curr_rtt */ u8 found; /* the exit point is found? */ @@ -398,8 +399,12 @@ static void bictcp_acked(struct sock *sk, u32 cnt, s32 rtt_us) u32 delay; if (icsk->icsk_ca_state == TCP_CA_Open) { - cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; - ca->delayed_ack += cnt; + u32 ratio = ca->delayed_ack; + + ratio -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ratio += cnt; + + ca->delayed_ack = min(ratio, ACK_RATIO_LIMIT); } /* Some calls are for duplicates without timetamps */ From dcbe14b91a920657ff3a9ba0efb7c5b5562f956a Mon Sep 17 00:00:00 2001 From: Kleber Sacilotto de Souza Date: Wed, 4 May 2011 13:05:11 +0000 Subject: [PATCH 08/27] ehea: fix wrongly reported speed and port Currently EHEA reports to ethtool as supporting 10M, 100M, 1G and 10G and connected to FIBRE independent of the hardware configuration. However, when connected to FIBRE the only supported speed is 10G full-duplex, and the other speeds and modes are only supported when connected to twisted pair. Signed-off-by: Kleber Sacilotto de Souza Acked-by: Breno Leitao Signed-off-by: David S. Miller --- drivers/net/ehea/ehea_ethtool.c | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/drivers/net/ehea/ehea_ethtool.c b/drivers/net/ehea/ehea_ethtool.c index 3e2e734fecb7..f3bbdcef338c 100644 --- a/drivers/net/ehea/ehea_ethtool.c +++ b/drivers/net/ehea/ehea_ethtool.c @@ -55,15 +55,20 @@ static int ehea_get_settings(struct net_device *dev, struct ethtool_cmd *cmd) cmd->duplex = -1; } - cmd->supported = (SUPPORTED_10000baseT_Full | SUPPORTED_1000baseT_Full - | SUPPORTED_100baseT_Full | SUPPORTED_100baseT_Half - | SUPPORTED_10baseT_Full | SUPPORTED_10baseT_Half - | SUPPORTED_Autoneg | SUPPORTED_FIBRE); + if (cmd->speed == SPEED_10000) { + cmd->supported = (SUPPORTED_10000baseT_Full | SUPPORTED_FIBRE); + cmd->advertising = (ADVERTISED_10000baseT_Full | ADVERTISED_FIBRE); + cmd->port = PORT_FIBRE; + } else { + cmd->supported = (SUPPORTED_1000baseT_Full | SUPPORTED_100baseT_Full + | SUPPORTED_100baseT_Half | SUPPORTED_10baseT_Full + | SUPPORTED_10baseT_Half | SUPPORTED_Autoneg + | SUPPORTED_TP); + cmd->advertising = (ADVERTISED_1000baseT_Full | ADVERTISED_Autoneg + | ADVERTISED_TP); + cmd->port = PORT_TP; + } - cmd->advertising = (ADVERTISED_10000baseT_Full | ADVERTISED_Autoneg - | ADVERTISED_FIBRE); - - cmd->port = PORT_FIBRE; cmd->autoneg = port->autoneg == 1 ? AUTONEG_ENABLE : AUTONEG_DISABLE; return 0; From 6709d9521df05c105343473ab8b147e2ef1e13d8 Mon Sep 17 00:00:00 2001 From: Somnath Kotur Date: Wed, 4 May 2011 22:40:46 +0000 Subject: [PATCH 09/27] be2net: Fixed bugs related to PVID. Fixed bug to make sure 'pvid' retrieval will work on big endian hosts. Fixed incorrect comparison between the Rx Completion's 16-bit VLAN TCI and the PVID. Now comparing only the relevant 12 bits corresponding to the VID. Renamed 'vid' field under Rx Completion to 'vlan_tag' to reflect accurate description. Signed-off-by: Somnath Kotur Signed-off-by: David S. Miller --- drivers/net/benet/be.h | 2 +- drivers/net/benet/be_cmds.c | 2 +- drivers/net/benet/be_main.c | 18 ++++++++++++------ 3 files changed, 14 insertions(+), 8 deletions(-) diff --git a/drivers/net/benet/be.h b/drivers/net/benet/be.h index 66823eded7a3..2353eca32593 100644 --- a/drivers/net/benet/be.h +++ b/drivers/net/benet/be.h @@ -213,7 +213,7 @@ struct be_rx_stats { struct be_rx_compl_info { u32 rss_hash; - u16 vid; + u16 vlan_tag; u16 pkt_size; u16 rxq_idx; u16 mac_id; diff --git a/drivers/net/benet/be_cmds.c b/drivers/net/benet/be_cmds.c index 1e2d825bb94a..9dc9394fd4ca 100644 --- a/drivers/net/benet/be_cmds.c +++ b/drivers/net/benet/be_cmds.c @@ -132,7 +132,7 @@ static void be_async_grp5_pvid_state_process(struct be_adapter *adapter, struct be_async_event_grp5_pvid_state *evt) { if (evt->enabled) - adapter->pvid = evt->tag; + adapter->pvid = le16_to_cpu(evt->tag); else adapter->pvid = 0; } diff --git a/drivers/net/benet/be_main.c b/drivers/net/benet/be_main.c index 02a0443d1821..9187fb4e08f1 100644 --- a/drivers/net/benet/be_main.c +++ b/drivers/net/benet/be_main.c @@ -1018,7 +1018,8 @@ static void be_rx_compl_process(struct be_adapter *adapter, kfree_skb(skb); return; } - vlan_hwaccel_receive_skb(skb, adapter->vlan_grp, rxcp->vid); + vlan_hwaccel_receive_skb(skb, adapter->vlan_grp, + rxcp->vlan_tag); } else { netif_receive_skb(skb); } @@ -1076,7 +1077,8 @@ static void be_rx_compl_process_gro(struct be_adapter *adapter, if (likely(!rxcp->vlanf)) napi_gro_frags(&eq_obj->napi); else - vlan_gro_frags(&eq_obj->napi, adapter->vlan_grp, rxcp->vid); + vlan_gro_frags(&eq_obj->napi, adapter->vlan_grp, + rxcp->vlan_tag); } static void be_parse_rx_compl_v1(struct be_adapter *adapter, @@ -1102,7 +1104,8 @@ static void be_parse_rx_compl_v1(struct be_adapter *adapter, rxcp->pkt_type = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, cast_enc, compl); rxcp->vtm = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, vtm, compl); - rxcp->vid = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, vlan_tag, compl); + rxcp->vlan_tag = AMAP_GET_BITS(struct amap_eth_rx_compl_v1, vlan_tag, + compl); } static void be_parse_rx_compl_v0(struct be_adapter *adapter, @@ -1128,7 +1131,8 @@ static void be_parse_rx_compl_v0(struct be_adapter *adapter, rxcp->pkt_type = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, cast_enc, compl); rxcp->vtm = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, vtm, compl); - rxcp->vid = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, vlan_tag, compl); + rxcp->vlan_tag = AMAP_GET_BITS(struct amap_eth_rx_compl_v0, vlan_tag, + compl); } static struct be_rx_compl_info *be_rx_compl_get(struct be_rx_obj *rxo) @@ -1155,9 +1159,11 @@ static struct be_rx_compl_info *be_rx_compl_get(struct be_rx_obj *rxo) rxcp->vlanf = 0; if (!lancer_chip(adapter)) - rxcp->vid = swab16(rxcp->vid); + rxcp->vlan_tag = swab16(rxcp->vlan_tag); - if ((adapter->pvid == rxcp->vid) && !adapter->vlan_tag[rxcp->vid]) + if (((adapter->pvid & VLAN_VID_MASK) == + (rxcp->vlan_tag & VLAN_VID_MASK)) && + !adapter->vlan_tag[rxcp->vlan_tag]) rxcp->vlanf = 0; /* As the compl has been parsed, reset it; we wont touch it again */ From 057bef938896e6266ae24ec4266d24792d27c29a Mon Sep 17 00:00:00 2001 From: Matvejchikov Ilya Date: Fri, 6 May 2011 06:23:09 +0000 Subject: [PATCH 10/27] NET: slip, fix ldisc->open retval TTY layer expects 0 if the ldisc->open operation succeeded. Signed-off-by : Matvejchikov Ilya Acked-by: Oliver Hartkopp Acked-by: Alan Cox Signed-off-by: David S. Miller --- drivers/net/slip.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/slip.c b/drivers/net/slip.c index 86cbb9ea2f26..8ec1a9a0bb9a 100644 --- a/drivers/net/slip.c +++ b/drivers/net/slip.c @@ -853,7 +853,9 @@ static int slip_open(struct tty_struct *tty) /* Done. We have linked the TTY line to a channel. */ rtnl_unlock(); tty->receive_room = 65536; /* We don't flow control */ - return sl->dev->base_addr; + + /* TTY layer expects 0 on success */ + return 0; err_free_bufs: sl_free_bufs(sl); From ce3dad0f74e6b240f0b1dedbd8ea268a3f298d82 Mon Sep 17 00:00:00 2001 From: Toshiharu Okada Date: Fri, 6 May 2011 02:53:51 +0000 Subject: [PATCH 11/27] PCH_GbE : Fixed the issue of collision detection The collision detection setting was invalid. When collision occurred, because data was not resent, there was an issue to which a transmitting throughput falls. This patch enables the collision detection. Signed-off-by: Toshiharu Okada Signed-off-by: David S. Miller --- drivers/net/pch_gbe/pch_gbe_main.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c index 2ef2f9cdefa6..4ebd1d4ad3ed 100644 --- a/drivers/net/pch_gbe/pch_gbe_main.c +++ b/drivers/net/pch_gbe/pch_gbe_main.c @@ -43,8 +43,7 @@ const char pch_driver_version[] = DRV_VERSION; #define PCH_GBE_MAC_RGMII_CTRL_SETTING ( \ PCH_GBE_CHIP_TYPE_INTERNAL | \ - PCH_GBE_RGMII_MODE_RGMII | \ - PCH_GBE_CRS_SEL \ + PCH_GBE_RGMII_MODE_RGMII \ ) /* Ethertype field values */ From 5d05a04d283061b586e8dc819cfa6f4b8cfd5948 Mon Sep 17 00:00:00 2001 From: Toshiharu Okada Date: Fri, 6 May 2011 02:53:56 +0000 Subject: [PATCH 12/27] PCH_GbE : Fixed the issue of checksum judgment The checksum judgment was mistaken. Judgment result 0:Correct 1:Wrong This patch fixes the issue. Signed-off-by: Toshiharu Okada Signed-off-by: David S. Miller --- drivers/net/pch_gbe/pch_gbe_main.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c index 4ebd1d4ad3ed..9f6c4025c6ea 100644 --- a/drivers/net/pch_gbe/pch_gbe_main.c +++ b/drivers/net/pch_gbe/pch_gbe_main.c @@ -1493,12 +1493,11 @@ pch_gbe_clean_rx(struct pch_gbe_adapter *adapter, /* Write meta date of skb */ skb_put(skb, length); skb->protocol = eth_type_trans(skb, netdev); - if ((tcp_ip_status & PCH_GBE_RXD_ACC_STAT_TCPIPOK) == - PCH_GBE_RXD_ACC_STAT_TCPIPOK) { - skb->ip_summed = CHECKSUM_UNNECESSARY; - } else { + if (tcp_ip_status & PCH_GBE_RXD_ACC_STAT_TCPIPOK) skb->ip_summed = CHECKSUM_NONE; - } + else + skb->ip_summed = CHECKSUM_UNNECESSARY; + napi_gro_receive(&adapter->napi, skb); (*work_done)++; pr_debug("Receive skb->ip_summed: %d length: %d\n", From b0e6baf5619a6fa3eaf43b55fdb4daa362c3c916 Mon Sep 17 00:00:00 2001 From: Tomoya Date: Mon, 9 May 2011 01:19:37 +0000 Subject: [PATCH 13/27] pch_gbe: support ML7223 IOH Support new device OKI SEMICONDUCTOR ML7223 IOH(Input/Output Hub). The ML7223 IOH is for MP(Media Phone) use. The ML7223 is companion chip for Intel Atom E6xx series. The ML7223 is completely compatible for Intel EG20T PCH. Signed-off-by: Tomoya MORINAGA Signed-off-by: David S. Miller --- drivers/net/Kconfig | 8 +++++++- drivers/net/pch_gbe/pch_gbe_main.c | 11 +++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index dc280bc8eba2..6c884ef1b069 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -2536,7 +2536,7 @@ config S6GMAC source "drivers/net/stmmac/Kconfig" config PCH_GBE - tristate "PCH Gigabit Ethernet" + tristate "Intel EG20T PCH / OKI SEMICONDUCTOR ML7223 IOH GbE" depends on PCI select MII ---help--- @@ -2548,6 +2548,12 @@ config PCH_GBE to Gigabit Ethernet. This driver enables Gigabit Ethernet function. + This driver also can be used for OKI SEMICONDUCTOR IOH(Input/ + Output Hub), ML7223. + ML7223 IOH is for MP(Media Phone) use. + ML7223 is companion chip for Intel Atom E6xx series. + ML7223 is completely compatible for Intel EG20T PCH. + endif # NETDEV_1000 # diff --git a/drivers/net/pch_gbe/pch_gbe_main.c b/drivers/net/pch_gbe/pch_gbe_main.c index 9f6c4025c6ea..56d049a472da 100644 --- a/drivers/net/pch_gbe/pch_gbe_main.c +++ b/drivers/net/pch_gbe/pch_gbe_main.c @@ -34,6 +34,10 @@ const char pch_driver_version[] = DRV_VERSION; #define PCH_GBE_COPYBREAK_DEFAULT 256 #define PCH_GBE_PCI_BAR 1 +/* Macros for ML7223 */ +#define PCI_VENDOR_ID_ROHM 0x10db +#define PCI_DEVICE_ID_ROHM_ML7223_GBE 0x8013 + #define PCH_GBE_TX_WEIGHT 64 #define PCH_GBE_RX_WEIGHT 64 #define PCH_GBE_RX_BUFFER_WRITE 16 @@ -2418,6 +2422,13 @@ static DEFINE_PCI_DEVICE_TABLE(pch_gbe_pcidev_id) = { .class = (PCI_CLASS_NETWORK_ETHERNET << 8), .class_mask = (0xFFFF00) }, + {.vendor = PCI_VENDOR_ID_ROHM, + .device = PCI_DEVICE_ID_ROHM_ML7223_GBE, + .subvendor = PCI_ANY_ID, + .subdevice = PCI_ANY_ID, + .class = (PCI_CLASS_NETWORK_ETHERNET << 8), + .class_mask = (0xFFFF00) + }, /* required last entry */ {0} }; From 315c34dae0069d0c67abd714bb846cd466289c7f Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 21 Apr 2011 10:55:07 +0200 Subject: [PATCH 14/27] netfilter: ctnetlink: fix timestamp support for new conntracks This patch fixes the missing initialization of the start time if the timestamp support is enabled. libnetfilter_conntrack/utils# conntrack -E & libnetfilter_conntrack/utils# ./conntrack_create tcp 6 109 ESTABLISHED src=1.1.1.1 dst=2.2.2.2 sport=1025 dport=21 packets=0 bytes=0 [UNREPLIED] src=2.2.2.2 dst=1.1.1.1 sport=21 dport=1025 packets=0 bytes=0 mark=0 delta-time=1303296401 use=2 Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_netlink.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 30bf8a167fc8..482e90c61850 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1334,6 +1334,7 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, struct nf_conn *ct; int err = -EINVAL; struct nf_conntrack_helper *helper; + struct nf_conn_tstamp *tstamp; ct = nf_conntrack_alloc(net, zone, otuple, rtuple, GFP_ATOMIC); if (IS_ERR(ct)) @@ -1451,6 +1452,9 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, __set_bit(IPS_EXPECTED_BIT, &ct->status); ct->master = master_ct; } + tstamp = nf_conn_tstamp_find(ct); + if (tstamp) + tstamp->start = ktime_to_ns(ktime_get_real()); add_timer(&ct->timeout); nf_conntrack_hash_insert(ct); From 5a6351eecf8c87afed9c883bb6341d09406d74ba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Apr 2011 10:57:21 +0200 Subject: [PATCH 15/27] netfilter: fix ebtables compat support commit 255d0dc34068a976 (netfilter: x_table: speedup compat operations) made ebtables not working anymore. 1) xt_compat_calc_jump() is not an exact match lookup 2) compat_table_info() has a typo in xt_compat_init_offsets() call 3) compat_do_replace() misses a xt_compat_init_offsets() call Reported-by: dann frazier Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebtables.c | 3 ++- net/netfilter/x_tables.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 893669caa8de..9707079bc40a 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1766,7 +1766,7 @@ static int compat_table_info(const struct ebt_table_info *info, newinfo->entries_size = size; - xt_compat_init_offsets(AF_INET, info->nentries); + xt_compat_init_offsets(NFPROTO_BRIDGE, info->nentries); return EBT_ENTRY_ITERATE(entries, size, compat_calc_entry, info, entries, newinfo); } @@ -2240,6 +2240,7 @@ static int compat_do_replace(struct net *net, void __user *user, xt_compat_lock(NFPROTO_BRIDGE); + xt_compat_init_offsets(NFPROTO_BRIDGE, tmp.nentries); ret = compat_copy_entries(entries_tmp, tmp.entries_size, &state); if (ret < 0) goto out_unlock; diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index a9adf4c6b299..8a025a585d2f 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -455,6 +455,7 @@ void xt_compat_flush_offsets(u_int8_t af) vfree(xt[af].compat_tab); xt[af].compat_tab = NULL; xt[af].number = 0; + xt[af].cur = 0; } } EXPORT_SYMBOL_GPL(xt_compat_flush_offsets); @@ -473,8 +474,7 @@ int xt_compat_calc_jump(u_int8_t af, unsigned int offset) else return mid ? tmp[mid - 1].delta : 0; } - WARN_ON_ONCE(1); - return 0; + return left ? tmp[left - 1].delta : 0; } EXPORT_SYMBOL_GPL(xt_compat_calc_jump); From 103a9778e07bcc0cd34b5c35a87281454eec719e Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 21 Apr 2011 10:58:25 +0200 Subject: [PATCH 16/27] netfilter: ebtables: only call xt_compat_add_offset once per rule The optimizations in commit 255d0dc34068a976 (netfilter: x_table: speedup compat operations) assume that xt_compat_add_offset is called once per rule. ebtables however called it for each match/target found in a rule. The match/watcher/target parser already returns the needed delta, so it is sufficient to move the xt_compat_add_offset call to a more reasonable location. While at it, also get rid of the unused COMPAT iterator macros. Signed-off-by: Florian Westphal Signed-off-by: Patrick McHardy --- net/bridge/netfilter/ebtables.c | 61 +++++---------------------------- 1 file changed, 9 insertions(+), 52 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 9707079bc40a..1a92b369c820 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1882,7 +1882,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, struct xt_match *match; struct xt_target *wt; void *dst = NULL; - int off, pad = 0, ret = 0; + int off, pad = 0; unsigned int size_kern, entry_offset, match_size = mwt->match_size; strlcpy(name, mwt->u.name, sizeof(name)); @@ -1935,13 +1935,6 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, break; } - if (!dst) { - ret = xt_compat_add_offset(NFPROTO_BRIDGE, entry_offset, - off + ebt_compat_entry_padsize()); - if (ret < 0) - return ret; - } - state->buf_kern_offset += match_size + off; state->buf_user_offset += match_size; pad = XT_ALIGN(size_kern) - size_kern; @@ -2016,50 +2009,6 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, return growth; } -#define EBT_COMPAT_WATCHER_ITERATE(e, fn, args...) \ -({ \ - unsigned int __i; \ - int __ret = 0; \ - struct compat_ebt_entry_mwt *__watcher; \ - \ - for (__i = e->watchers_offset; \ - __i < (e)->target_offset; \ - __i += __watcher->watcher_size + \ - sizeof(struct compat_ebt_entry_mwt)) { \ - __watcher = (void *)(e) + __i; \ - __ret = fn(__watcher , ## args); \ - if (__ret != 0) \ - break; \ - } \ - if (__ret == 0) { \ - if (__i != (e)->target_offset) \ - __ret = -EINVAL; \ - } \ - __ret; \ -}) - -#define EBT_COMPAT_MATCH_ITERATE(e, fn, args...) \ -({ \ - unsigned int __i; \ - int __ret = 0; \ - struct compat_ebt_entry_mwt *__match; \ - \ - for (__i = sizeof(struct ebt_entry); \ - __i < (e)->watchers_offset; \ - __i += __match->match_size + \ - sizeof(struct compat_ebt_entry_mwt)) { \ - __match = (void *)(e) + __i; \ - __ret = fn(__match , ## args); \ - if (__ret != 0) \ - break; \ - } \ - if (__ret == 0) { \ - if (__i != (e)->watchers_offset) \ - __ret = -EINVAL; \ - } \ - __ret; \ -}) - /* called for all ebt_entry structures. */ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, unsigned int *total, @@ -2132,6 +2081,14 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base, } } + if (state->buf_kern_start == NULL) { + unsigned int offset = buf_start - (char *) base; + + ret = xt_compat_add_offset(NFPROTO_BRIDGE, offset, new_offset); + if (ret < 0) + return ret; + } + startoff = state->buf_user_offset - startoff; BUG_ON(*total < startoff); From 1ae132b0347907ac95b8bc9dba37934f59d2a508 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Tue, 3 May 2011 22:09:30 +0200 Subject: [PATCH 17/27] IPVS: Change of socket usage to enable name space exit. If the sync daemons run in a name space while it crashes or get killed, there is no way to stop them except for a reboot. When all patches are there, ip_vs_core will handle register_pernet_(), i.e. ip_vs_sync_init() and ip_vs_sync_cleanup() will be removed. Kernel threads should not increment the use count of a socket. By calling sk_change_net() after creating a socket this is avoided. sock_release cant be used intead sk_release_kernel() should be used. Thanks Eric W Biederman for your advices. Signed-off-by: Hans Schillstrom [horms@verge.net.au: minor edit to changelog] Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 2 +- net/netfilter/ipvs/ip_vs_sync.c | 58 +++++++++++++++++++++------------ 2 files changed, 38 insertions(+), 22 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 07accf6b2401..a0791dc05a27 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1896,7 +1896,7 @@ static int __net_init __ip_vs_init(struct net *net) static void __net_exit __ip_vs_cleanup(struct net *net) { - IP_VS_DBG(10, "ipvs netns %d released\n", net_ipvs(net)->gen); + IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); } static struct pernet_operations ipvs_core_ops = { diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 3e7961e85e9c..0cce95310820 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1303,13 +1303,18 @@ static struct socket *make_send_sock(struct net *net) struct socket *sock; int result; - /* First create a socket */ - result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1); + /* First create a socket move it to right name space later */ + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); result = set_mcast_if(sock->sk, ipvs->master_mcast_ifn); if (result < 0) { pr_err("Error setting outbound mcast interface\n"); @@ -1334,8 +1339,8 @@ static struct socket *make_send_sock(struct net *net) return sock; - error: - sock_release(sock); +error: + sk_release_kernel(sock->sk); return ERR_PTR(result); } @@ -1350,12 +1355,17 @@ static struct socket *make_receive_sock(struct net *net) int result; /* First create a socket */ - result = __sock_create(net, PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock, 1); + result = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock); if (result < 0) { pr_err("Error during creation of socket; terminating\n"); return ERR_PTR(result); } - + /* + * Kernel sockets that are a part of a namespace, should not + * hold a reference to a namespace in order to allow to stop it. + * After sk_change_net should be released using sk_release_kernel. + */ + sk_change_net(sock->sk, net); /* it is equivalent to the REUSEADDR option in user-space */ sock->sk->sk_reuse = 1; @@ -1377,8 +1387,8 @@ static struct socket *make_receive_sock(struct net *net) return sock; - error: - sock_release(sock); +error: + sk_release_kernel(sock->sk); return ERR_PTR(result); } @@ -1473,7 +1483,7 @@ static int sync_thread_master(void *data) ip_vs_sync_buff_release(sb); /* release the sending multicast socket */ - sock_release(tinfo->sock); + sk_release_kernel(tinfo->sock->sk); kfree(tinfo); return 0; @@ -1513,7 +1523,7 @@ static int sync_thread_backup(void *data) } /* release the sending multicast socket */ - sock_release(tinfo->sock); + sk_release_kernel(tinfo->sock->sk); kfree(tinfo->buf); kfree(tinfo); @@ -1601,7 +1611,7 @@ outtinfo: outbuf: kfree(buf); outsocket: - sock_release(sock); + sk_release_kernel(sock->sk); out: return result; } @@ -1610,6 +1620,7 @@ out: int stop_sync_thread(struct net *net, int state) { struct netns_ipvs *ipvs = net_ipvs(net); + int retc = -EINVAL; IP_VS_DBG(7, "%s(): pid %d\n", __func__, task_pid_nr(current)); @@ -1629,7 +1640,7 @@ int stop_sync_thread(struct net *net, int state) spin_lock_bh(&ipvs->sync_lock); ipvs->sync_state &= ~IP_VS_STATE_MASTER; spin_unlock_bh(&ipvs->sync_lock); - kthread_stop(ipvs->master_thread); + retc = kthread_stop(ipvs->master_thread); ipvs->master_thread = NULL; } else if (state == IP_VS_STATE_BACKUP) { if (!ipvs->backup_thread) @@ -1639,16 +1650,14 @@ int stop_sync_thread(struct net *net, int state) task_pid_nr(ipvs->backup_thread)); ipvs->sync_state &= ~IP_VS_STATE_BACKUP; - kthread_stop(ipvs->backup_thread); + retc = kthread_stop(ipvs->backup_thread); ipvs->backup_thread = NULL; - } else { - return -EINVAL; } /* decrease the module use count */ ip_vs_use_count_dec(); - return 0; + return retc; } /* @@ -1670,8 +1679,15 @@ static int __net_init __ip_vs_sync_init(struct net *net) static void __ip_vs_sync_cleanup(struct net *net) { - stop_sync_thread(net, IP_VS_STATE_MASTER); - stop_sync_thread(net, IP_VS_STATE_BACKUP); + int retc; + + retc = stop_sync_thread(net, IP_VS_STATE_MASTER); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Master Daemon\n"); + + retc = stop_sync_thread(net, IP_VS_STATE_BACKUP); + if (retc && retc != -ESRCH) + pr_err("Failed to stop Backup Daemon\n"); } static struct pernet_operations ipvs_sync_ops = { @@ -1682,10 +1698,10 @@ static struct pernet_operations ipvs_sync_ops = { int __init ip_vs_sync_init(void) { - return register_pernet_subsys(&ipvs_sync_ops); + return register_pernet_device(&ipvs_sync_ops); } void ip_vs_sync_cleanup(void) { - unregister_pernet_subsys(&ipvs_sync_ops); + unregister_pernet_device(&ipvs_sync_ops); } From 7a4f0761fce32ff4918a7c23b08db564ad33092d Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Tue, 3 May 2011 22:09:31 +0200 Subject: [PATCH 18/27] IPVS: init and cleanup restructuring DESCRIPTION This patch tries to restore the initial init and cleanup sequences that was before namspace patch. Netns also requires action when net devices unregister which has never been implemented. I.e this patch also covers when a device moves into a network namespace, and has to be released. IMPLEMENTATION The number of calls to register_pernet_device have been reduced to one for the ip_vs.ko Schedulers still have their own calls. This patch adds a function __ip_vs_service_cleanup() and an enable flag for the netfilter hooks. The nf hooks will be enabled when the first service is loaded and never disabled again, except when a namespace exit starts. Signed-off-by: Hans Schillstrom Acked-by: Julian Anastasov [horms@verge.net.au: minor edit to changelog] Signed-off-by: Simon Horman --- include/net/ip_vs.h | 17 +++++ net/netfilter/ipvs/ip_vs_app.c | 15 +--- net/netfilter/ipvs/ip_vs_conn.c | 12 +--- net/netfilter/ipvs/ip_vs_core.c | 101 +++++++++++++++++++++++--- net/netfilter/ipvs/ip_vs_ctl.c | 120 ++++++++++++++++++++++++++----- net/netfilter/ipvs/ip_vs_est.c | 14 +--- net/netfilter/ipvs/ip_vs_proto.c | 11 +-- net/netfilter/ipvs/ip_vs_sync.c | 13 +--- 8 files changed, 223 insertions(+), 80 deletions(-) diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index d516f00c8e0f..86aefed6140b 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -791,6 +791,7 @@ struct ip_vs_app { /* IPVS in network namespace */ struct netns_ipvs { int gen; /* Generation */ + int enable; /* enable like nf_hooks do */ /* * Hash table: for real service lookups */ @@ -1089,6 +1090,22 @@ ip_vs_control_add(struct ip_vs_conn *cp, struct ip_vs_conn *ctl_cp) atomic_inc(&ctl_cp->n_control); } +/* + * IPVS netns init & cleanup functions + */ +extern int __ip_vs_estimator_init(struct net *net); +extern int __ip_vs_control_init(struct net *net); +extern int __ip_vs_protocol_init(struct net *net); +extern int __ip_vs_app_init(struct net *net); +extern int __ip_vs_conn_init(struct net *net); +extern int __ip_vs_sync_init(struct net *net); +extern void __ip_vs_conn_cleanup(struct net *net); +extern void __ip_vs_app_cleanup(struct net *net); +extern void __ip_vs_protocol_cleanup(struct net *net); +extern void __ip_vs_control_cleanup(struct net *net); +extern void __ip_vs_estimator_cleanup(struct net *net); +extern void __ip_vs_sync_cleanup(struct net *net); +extern void __ip_vs_service_cleanup(struct net *net); /* * IPVS application functions diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index 2dc6de13ac18..51f3af7c4743 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -576,7 +576,7 @@ static const struct file_operations ip_vs_app_fops = { }; #endif -static int __net_init __ip_vs_app_init(struct net *net) +int __net_init __ip_vs_app_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -585,26 +585,17 @@ static int __net_init __ip_vs_app_init(struct net *net) return 0; } -static void __net_exit __ip_vs_app_cleanup(struct net *net) +void __net_exit __ip_vs_app_cleanup(struct net *net) { proc_net_remove(net, "ip_vs_app"); } -static struct pernet_operations ip_vs_app_ops = { - .init = __ip_vs_app_init, - .exit = __ip_vs_app_cleanup, -}; - int __init ip_vs_app_init(void) { - int rv; - - rv = register_pernet_subsys(&ip_vs_app_ops); - return rv; + return 0; } void ip_vs_app_cleanup(void) { - unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index c97bd45975be..d3fd91bbba49 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1258,22 +1258,17 @@ int __net_init __ip_vs_conn_init(struct net *net) return 0; } -static void __net_exit __ip_vs_conn_cleanup(struct net *net) +void __net_exit __ip_vs_conn_cleanup(struct net *net) { /* flush all the connection entries first */ ip_vs_conn_flush(net); proc_net_remove(net, "ip_vs_conn"); proc_net_remove(net, "ip_vs_conn_sync"); } -static struct pernet_operations ipvs_conn_ops = { - .init = __ip_vs_conn_init, - .exit = __ip_vs_conn_cleanup, -}; int __init ip_vs_conn_init(void) { int idx; - int retc; /* Compute size and mask */ ip_vs_conn_tab_size = 1 << ip_vs_conn_tab_bits; @@ -1309,17 +1304,14 @@ int __init ip_vs_conn_init(void) rwlock_init(&__ip_vs_conntbl_lock_array[idx].l); } - retc = register_pernet_subsys(&ipvs_conn_ops); - /* calculate the random value for connection hash */ get_random_bytes(&ip_vs_conn_rnd, sizeof(ip_vs_conn_rnd)); - return retc; + return 0; } void ip_vs_conn_cleanup(void) { - unregister_pernet_subsys(&ipvs_conn_ops); /* Release the empty cache */ kmem_cache_destroy(ip_vs_conn_cachep); vfree(ip_vs_conn_tab); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a0791dc05a27..a74dae6c5dbc 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1113,6 +1113,9 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) return NF_ACCEPT; net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1343,6 +1346,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) return NF_ACCEPT; /* The packet looks wrong, ignore */ net = skb_net(skb); + pd = ip_vs_proto_data_get(net, cih->protocol); if (!pd) return NF_ACCEPT; @@ -1529,6 +1533,11 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); return NF_ACCEPT; } + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); /* Bad... Do not break raw sockets */ @@ -1562,7 +1571,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } - net = skb_net(skb); /* Protocol supported? */ pd = ip_vs_proto_data_get(net, iph.protocol); if (unlikely(!pd)) @@ -1588,7 +1596,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) } IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); - net = skb_net(skb); ipvs = net_ipvs(net); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { @@ -1743,10 +1750,16 @@ ip_vs_forward_icmp(unsigned int hooknum, struct sk_buff *skb, int (*okfn)(struct sk_buff *)) { int r; + struct net *net; if (ip_hdr(skb)->protocol != IPPROTO_ICMP) return NF_ACCEPT; + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + return ip_vs_in_icmp(skb, &r, hooknum); } @@ -1757,10 +1770,16 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, int (*okfn)(struct sk_buff *)) { int r; + struct net *net; if (ipv6_hdr(skb)->nexthdr != IPPROTO_ICMPV6) return NF_ACCEPT; + /* ipvs enabled in this netns ? */ + net = skb_net(skb); + if (!net_ipvs(net)->enable) + return NF_ACCEPT; + return ip_vs_in_icmp_v6(skb, &r, hooknum); } #endif @@ -1884,21 +1903,72 @@ static int __net_init __ip_vs_init(struct net *net) pr_err("%s(): no memory.\n", __func__); return -ENOMEM; } + /* Hold the beast until a service is registerd */ + ipvs->enable = 0; ipvs->net = net; /* Counters used for creating unique names */ ipvs->gen = atomic_read(&ipvs_netns_cnt); atomic_inc(&ipvs_netns_cnt); net->ipvs = ipvs; + + if (__ip_vs_estimator_init(net) < 0) + goto estimator_fail; + + if (__ip_vs_control_init(net) < 0) + goto control_fail; + + if (__ip_vs_protocol_init(net) < 0) + goto protocol_fail; + + if (__ip_vs_app_init(net) < 0) + goto app_fail; + + if (__ip_vs_conn_init(net) < 0) + goto conn_fail; + + if (__ip_vs_sync_init(net) < 0) + goto sync_fail; + printk(KERN_INFO "IPVS: Creating netns size=%zu id=%d\n", sizeof(struct netns_ipvs), ipvs->gen); return 0; +/* + * Error handling + */ + +sync_fail: + __ip_vs_conn_cleanup(net); +conn_fail: + __ip_vs_app_cleanup(net); +app_fail: + __ip_vs_protocol_cleanup(net); +protocol_fail: + __ip_vs_control_cleanup(net); +control_fail: + __ip_vs_estimator_cleanup(net); +estimator_fail: + return -ENOMEM; } static void __net_exit __ip_vs_cleanup(struct net *net) { + __ip_vs_service_cleanup(net); /* ip_vs_flush() with locks */ + __ip_vs_conn_cleanup(net); + __ip_vs_app_cleanup(net); + __ip_vs_protocol_cleanup(net); + __ip_vs_control_cleanup(net); + __ip_vs_estimator_cleanup(net); IP_VS_DBG(2, "ipvs netns %d released\n", net_ipvs(net)->gen); } +static void __net_exit __ip_vs_dev_cleanup(struct net *net) +{ + EnterFunction(2); + net_ipvs(net)->enable = 0; /* Disable packet reception */ + __ip_vs_sync_cleanup(net); + LeaveFunction(2); +} + static struct pernet_operations ipvs_core_ops = { .init = __ip_vs_init, .exit = __ip_vs_cleanup, @@ -1906,6 +1976,10 @@ static struct pernet_operations ipvs_core_ops = { .size = sizeof(struct netns_ipvs), }; +static struct pernet_operations ipvs_core_dev_ops = { + .exit = __ip_vs_dev_cleanup, +}; + /* * Initialize IP Virtual Server */ @@ -1913,10 +1987,6 @@ static int __init ip_vs_init(void) { int ret; - ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ - if (ret < 0) - return ret; - ip_vs_estimator_init(); ret = ip_vs_control_init(); if (ret < 0) { @@ -1944,15 +2014,28 @@ static int __init ip_vs_init(void) goto cleanup_conn; } + ret = register_pernet_subsys(&ipvs_core_ops); /* Alloc ip_vs struct */ + if (ret < 0) + goto cleanup_sync; + + ret = register_pernet_device(&ipvs_core_dev_ops); + if (ret < 0) + goto cleanup_sub; + ret = nf_register_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); if (ret < 0) { pr_err("can't register hooks.\n"); - goto cleanup_sync; + goto cleanup_dev; } pr_info("ipvs loaded.\n"); + return ret; +cleanup_dev: + unregister_pernet_device(&ipvs_core_dev_ops); +cleanup_sub: + unregister_pernet_subsys(&ipvs_core_ops); cleanup_sync: ip_vs_sync_cleanup(); cleanup_conn: @@ -1964,20 +2047,20 @@ cleanup_sync: ip_vs_control_cleanup(); cleanup_estimator: ip_vs_estimator_cleanup(); - unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ return ret; } static void __exit ip_vs_cleanup(void) { nf_unregister_hooks(ip_vs_ops, ARRAY_SIZE(ip_vs_ops)); + unregister_pernet_device(&ipvs_core_dev_ops); + unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ ip_vs_sync_cleanup(); ip_vs_conn_cleanup(); ip_vs_app_cleanup(); ip_vs_protocol_cleanup(); ip_vs_control_cleanup(); ip_vs_estimator_cleanup(); - unregister_pernet_subsys(&ipvs_core_ops); /* free ip_vs struct */ pr_info("ipvs unloaded.\n"); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ae47090bf45f..ea722810faf3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -69,6 +69,11 @@ int ip_vs_get_debug_level(void) } #endif + +/* Protos */ +static void __ip_vs_del_service(struct ip_vs_service *svc); + + #ifdef CONFIG_IP_VS_IPV6 /* Taken from rt6_fill_node() in net/ipv6/route.c, is there a better way? */ static int __ip_vs_addr_is_local_v6(struct net *net, @@ -1214,6 +1219,8 @@ ip_vs_add_service(struct net *net, struct ip_vs_service_user_kern *u, write_unlock_bh(&__ip_vs_svc_lock); *svc_p = svc; + /* Now there is a service - full throttle */ + ipvs->enable = 1; return 0; @@ -1472,6 +1479,84 @@ static int ip_vs_flush(struct net *net) return 0; } +/* + * Delete service by {netns} in the service table. + * Called by __ip_vs_cleanup() + */ +void __ip_vs_service_cleanup(struct net *net) +{ + EnterFunction(2); + /* Check for "full" addressed entries */ + mutex_lock(&__ip_vs_mutex); + ip_vs_flush(net); + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); +} +/* + * Release dst hold by dst_cache + */ +static inline void +__ip_vs_dev_reset(struct ip_vs_dest *dest, struct net_device *dev) +{ + spin_lock_bh(&dest->dst_lock); + if (dest->dst_cache && dest->dst_cache->dev == dev) { + IP_VS_DBG_BUF(3, "Reset dev:%s dest %s:%u ,dest->refcnt=%d\n", + dev->name, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port), + atomic_read(&dest->refcnt)); + ip_vs_dst_reset(dest); + } + spin_unlock_bh(&dest->dst_lock); + +} +/* + * Netdev event receiver + * Currently only NETDEV_UNREGISTER is handled, i.e. if we hold a reference to + * a device that is "unregister" it must be released. + */ +static int ip_vs_dst_event(struct notifier_block *this, unsigned long event, + void *ptr) +{ + struct net_device *dev = ptr; + struct net *net = dev_net(dev); + struct ip_vs_service *svc; + struct ip_vs_dest *dest; + unsigned int idx; + + if (event != NETDEV_UNREGISTER) + return NOTIFY_DONE; + IP_VS_DBG(3, "%s() dev=%s\n", __func__, dev->name); + EnterFunction(2); + mutex_lock(&__ip_vs_mutex); + for (idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + list_for_each_entry(svc, &ip_vs_svc_table[idx], s_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + } + + list_for_each_entry(svc, &ip_vs_svc_fwm_table[idx], f_list) { + if (net_eq(svc->net, net)) { + list_for_each_entry(dest, &svc->destinations, + n_list) { + __ip_vs_dev_reset(dest, dev); + } + } + + } + } + + list_for_each_entry(dest, &net_ipvs(net)->dest_trash, n_list) { + __ip_vs_dev_reset(dest, dev); + } + mutex_unlock(&__ip_vs_mutex); + LeaveFunction(2); + return NOTIFY_DONE; +} /* * Zero counters in a service or all services @@ -3588,6 +3673,10 @@ void __net_init __ip_vs_control_cleanup_sysctl(struct net *net) { } #endif +static struct notifier_block ip_vs_dst_notifier = { + .notifier_call = ip_vs_dst_event, +}; + int __net_init __ip_vs_control_init(struct net *net) { int idx; @@ -3626,7 +3715,7 @@ err: return -ENOMEM; } -static void __net_exit __ip_vs_control_cleanup(struct net *net) +void __net_exit __ip_vs_control_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -3639,11 +3728,6 @@ static void __net_exit __ip_vs_control_cleanup(struct net *net) free_percpu(ipvs->tot_stats.cpustats); } -static struct pernet_operations ipvs_control_ops = { - .init = __ip_vs_control_init, - .exit = __ip_vs_control_cleanup, -}; - int __init ip_vs_control_init(void) { int idx; @@ -3657,33 +3741,32 @@ int __init ip_vs_control_init(void) INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); } - ret = register_pernet_subsys(&ipvs_control_ops); - if (ret) { - pr_err("cannot register namespace.\n"); - goto err; - } - smp_wmb(); /* Do we really need it now ? */ ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); - goto err_net; + goto err_sock; } ret = ip_vs_genl_register(); if (ret) { pr_err("cannot register Generic Netlink interface.\n"); - nf_unregister_sockopt(&ip_vs_sockopts); - goto err_net; + goto err_genl; } + ret = register_netdevice_notifier(&ip_vs_dst_notifier); + if (ret < 0) + goto err_notf; + LeaveFunction(2); return 0; -err_net: - unregister_pernet_subsys(&ipvs_control_ops); -err: +err_notf: + ip_vs_genl_unregister(); +err_genl: + nf_unregister_sockopt(&ip_vs_sockopts); +err_sock: return ret; } @@ -3691,7 +3774,6 @@ err: void ip_vs_control_cleanup(void) { EnterFunction(2); - unregister_pernet_subsys(&ipvs_control_ops); ip_vs_genl_unregister(); nf_unregister_sockopt(&ip_vs_sockopts); LeaveFunction(2); diff --git a/net/netfilter/ipvs/ip_vs_est.c b/net/netfilter/ipvs/ip_vs_est.c index 8c8766ca56ad..508cce98777c 100644 --- a/net/netfilter/ipvs/ip_vs_est.c +++ b/net/netfilter/ipvs/ip_vs_est.c @@ -192,7 +192,7 @@ void ip_vs_read_estimator(struct ip_vs_stats_user *dst, dst->outbps = (e->outbps + 0xF) >> 5; } -static int __net_init __ip_vs_estimator_init(struct net *net) +int __net_init __ip_vs_estimator_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -203,24 +203,16 @@ static int __net_init __ip_vs_estimator_init(struct net *net) return 0; } -static void __net_exit __ip_vs_estimator_exit(struct net *net) +void __net_exit __ip_vs_estimator_cleanup(struct net *net) { del_timer_sync(&net_ipvs(net)->est_timer); } -static struct pernet_operations ip_vs_app_ops = { - .init = __ip_vs_estimator_init, - .exit = __ip_vs_estimator_exit, -}; int __init ip_vs_estimator_init(void) { - int rv; - - rv = register_pernet_subsys(&ip_vs_app_ops); - return rv; + return 0; } void ip_vs_estimator_cleanup(void) { - unregister_pernet_subsys(&ip_vs_app_ops); } diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 17484a4416ef..eb86028536fc 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -316,7 +316,7 @@ ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, /* * per network name-space init */ -static int __net_init __ip_vs_protocol_init(struct net *net) +int __net_init __ip_vs_protocol_init(struct net *net) { #ifdef CONFIG_IP_VS_PROTO_TCP register_ip_vs_proto_netns(net, &ip_vs_protocol_tcp); @@ -336,7 +336,7 @@ static int __net_init __ip_vs_protocol_init(struct net *net) return 0; } -static void __net_exit __ip_vs_protocol_cleanup(struct net *net) +void __net_exit __ip_vs_protocol_cleanup(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); struct ip_vs_proto_data *pd; @@ -349,11 +349,6 @@ static void __net_exit __ip_vs_protocol_cleanup(struct net *net) } } -static struct pernet_operations ipvs_proto_ops = { - .init = __ip_vs_protocol_init, - .exit = __ip_vs_protocol_cleanup, -}; - int __init ip_vs_protocol_init(void) { char protocols[64]; @@ -382,7 +377,6 @@ int __init ip_vs_protocol_init(void) REGISTER_PROTOCOL(&ip_vs_protocol_esp); #endif pr_info("Registered protocols (%s)\n", &protocols[2]); - return register_pernet_subsys(&ipvs_proto_ops); return 0; } @@ -393,7 +387,6 @@ void ip_vs_protocol_cleanup(void) struct ip_vs_protocol *pp; int i; - unregister_pernet_subsys(&ipvs_proto_ops); /* unregister all the ipvs protocols */ for (i = 0; i < IP_VS_PROTO_TAB_SIZE; i++) { while ((pp = ip_vs_proto_table[i]) != NULL) diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 0cce95310820..e292e5bddc70 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -1663,7 +1663,7 @@ int stop_sync_thread(struct net *net, int state) /* * Initialize data struct for each netns */ -static int __net_init __ip_vs_sync_init(struct net *net) +int __net_init __ip_vs_sync_init(struct net *net) { struct netns_ipvs *ipvs = net_ipvs(net); @@ -1677,7 +1677,7 @@ static int __net_init __ip_vs_sync_init(struct net *net) return 0; } -static void __ip_vs_sync_cleanup(struct net *net) +void __ip_vs_sync_cleanup(struct net *net) { int retc; @@ -1690,18 +1690,11 @@ static void __ip_vs_sync_cleanup(struct net *net) pr_err("Failed to stop Backup Daemon\n"); } -static struct pernet_operations ipvs_sync_ops = { - .init = __ip_vs_sync_init, - .exit = __ip_vs_sync_cleanup, -}; - - int __init ip_vs_sync_init(void) { - return register_pernet_device(&ipvs_sync_ops); + return 0; } void ip_vs_sync_cleanup(void) { - unregister_pernet_device(&ipvs_sync_ops); } From 4319cc0cf5bb894b7368008cdf6dd20eb8868018 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 10 May 2011 09:55:44 +0200 Subject: [PATCH 19/27] netfilter: IPv6: initialize TOS field in REJECT target module The IPv6 header is not zeroed out in alloc_skb so we must initialize it properly unless we want to see IPv6 packets with random TOS fields floating around. The current implementation resets the flow label but this could be changed if deemed necessary. We stumbled upon this issue when trying to apply a mangle rule to the RST packet generated by the REJECT target module. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Pablo Neira Ayuso --- net/ipv6/netfilter/ip6t_REJECT.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv6/netfilter/ip6t_REJECT.c b/net/ipv6/netfilter/ip6t_REJECT.c index 28e74488a329..a5a4c5dd5396 100644 --- a/net/ipv6/netfilter/ip6t_REJECT.c +++ b/net/ipv6/netfilter/ip6t_REJECT.c @@ -45,6 +45,8 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) int tcphoff, needs_ack; const struct ipv6hdr *oip6h = ipv6_hdr(oldskb); struct ipv6hdr *ip6h; +#define DEFAULT_TOS_VALUE 0x0U + const __u8 tclass = DEFAULT_TOS_VALUE; struct dst_entry *dst = NULL; u8 proto; struct flowi6 fl6; @@ -124,7 +126,7 @@ static void send_reset(struct net *net, struct sk_buff *oldskb) skb_put(nskb, sizeof(struct ipv6hdr)); skb_reset_network_header(nskb); ip6h = ipv6_hdr(nskb); - ip6h->version = 6; + *(__be32 *)ip6h = htonl(0x60000000 | (tclass << 20)); ip6h->hop_limit = ip6_dst_hoplimit(dst); ip6h->nexthdr = IPPROTO_TCP; ipv6_addr_copy(&ip6h->saddr, &oip6h->daddr); From 1ed2f73d90fb49bcf5704aee7e9084adb882bfc5 Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Tue, 10 May 2011 10:00:21 +0200 Subject: [PATCH 20/27] netfilter: IPv6: fix DSCP mangle code The mask indicates the bits one wants to zero out, so it needs to be inverted before applying to the original TOS field. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_DSCP.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_DSCP.c b/net/netfilter/xt_DSCP.c index 0a229191e55b..ae8271652efa 100644 --- a/net/netfilter/xt_DSCP.c +++ b/net/netfilter/xt_DSCP.c @@ -99,7 +99,7 @@ tos_tg6(struct sk_buff *skb, const struct xt_action_param *par) u_int8_t orig, nv; orig = ipv6_get_dsfield(iph); - nv = (orig & info->tos_mask) ^ info->tos_value; + nv = (orig & ~info->tos_mask) ^ info->tos_value; if (orig != nv) { if (!skb_make_writable(skb, sizeof(struct iphdr))) From 93bbce1ad0cd788190dd7d6c17d289f771fe3d0d Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 10 May 2011 12:13:36 +0200 Subject: [PATCH 21/27] netfilter: revert a2361c8735e07322023aedc36e4938b35af31eb0 This patch reverts a2361c8735e07322023aedc36e4938b35af31eb0: "[PATCH] netfilter: xt_conntrack: warn about use in raw table" Florian Wesphal says: "... when the packet was sent from the local machine the skb already has ->nfct attached, and -m conntrack seems to do the right thing." Acked-by: Jan Engelhardt Reported-by: Florian Wesphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_conntrack.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/net/netfilter/xt_conntrack.c b/net/netfilter/xt_conntrack.c index 481a86fdc409..61805d7b38aa 100644 --- a/net/netfilter/xt_conntrack.c +++ b/net/netfilter/xt_conntrack.c @@ -272,11 +272,6 @@ static int conntrack_mt_check(const struct xt_mtchk_param *par) { int ret; - if (strcmp(par->table, "raw") == 0) { - pr_info("state is undetermined at the time of raw table\n"); - return -EINVAL; - } - ret = nf_ct_l3proto_try_module_get(par->family); if (ret < 0) pr_info("cannot load conntrack support for proto=%u\n", From 55aee10dec477254241e4f72968f92e0543b33ad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 May 2011 12:22:54 -0700 Subject: [PATCH 22/27] vlan: fix GVRP at dismantle time MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ip link add link eth2 eth2.103 type vlan id 103 gvrp on loose_binding on ip link set eth2.103 up rmmod tg3 # driver providing eth2 BUG: unable to handle kernel NULL pointer dereference at (null) IP: [] garp_request_leave+0x3e/0xc0 [garp] PGD 11d251067 PUD 11b9e0067 PMD 0 Oops: 0000 [#1] SMP last sysfs file: /sys/devices/virtual/net/eth2.104/ifindex CPU 0 Modules linked in: tg3(-) 8021q garp nfsd lockd auth_rpcgss sunrpc libphy sg [last unloaded: x_tables] Pid: 11494, comm: rmmod Tainted: G W 2.6.39-rc6-00261-gfd71257-dirty #580 HP ProLiant BL460c G6 RIP: 0010:[] [] garp_request_leave+0x3e/0xc0 [garp] RSP: 0018:ffff88007a19bae8 EFLAGS: 00010286 RAX: 0000000000000000 RBX: ffff88011b5e2000 RCX: 0000000000000002 RDX: 0000000000000000 RSI: 0000000000000175 RDI: ffffffffa0030d5b RBP: ffff88007a19bb18 R08: 0000000000000001 R09: ffff88011bd64a00 R10: ffff88011d34ec00 R11: 0000000000000000 R12: 0000000000000002 R13: ffff88007a19bc48 R14: ffff88007a19bb88 R15: 0000000000000001 FS: 0000000000000000(0000) GS:ffff88011fc00000(0063) knlGS:00000000f77d76c0 CS: 0010 DS: 002b ES: 002b CR0: 000000008005003b CR2: 0000000000000000 CR3: 000000011a675000 CR4: 00000000000006f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process rmmod (pid: 11494, threadinfo ffff88007a19a000, task ffff8800798595c0) Stack: ffff88007a19bb36 ffff88011c84b800 ffff88011b5e2000 ffff88007a19bc48 ffff88007a19bb88 0000000000000006 ffff88007a19bb38 ffffffffa003a5f6 ffff88007a19bb38 670088007a19bba8 ffff88007a19bb58 ffffffffa00397e7 Call Trace: [] vlan_gvrp_request_leave+0x46/0x50 [8021q] [] vlan_dev_stop+0xb7/0xc0 [8021q] [] __dev_close_many+0x87/0xe0 [] dev_close_many+0x87/0x110 [] rollback_registered_many+0xa0/0x240 [] unregister_netdevice_many+0x19/0x60 [] vlan_device_event+0x53b/0x550 [8021q] [] ? ip6mr_device_event+0xa8/0xd0 [] notifier_call_chain+0x53/0x80 [] __raw_notifier_call_chain+0x9/0x10 [] raw_notifier_call_chain+0x11/0x20 [] call_netdevice_notifiers+0x32/0x60 [] rollback_registered_many+0x10f/0x240 [] rollback_registered+0x2f/0x40 [] unregister_netdevice_queue+0x58/0x90 [] unregister_netdev+0x1b/0x30 [] tg3_remove_one+0x6f/0x10b [tg3] We should call vlan_gvrp_request_leave() from unregister_vlan_dev(), not from vlan_dev_stop(), because vlan_gvrp_uninit_applicant() is called right after unregister_netdevice_queue(). In batch mode, unregister_netdevice_queue() doesn’t immediately call vlan_dev_stop(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan.c | 3 +++ net/8021q/vlan_dev.c | 3 --- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 7850412f52b7..0eb1a886b370 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -124,6 +124,9 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) grp->nr_vlans--; + if (vlan->flags & VLAN_FLAG_GVRP) + vlan_gvrp_request_leave(dev); + vlan_group_set_device(grp, vlan_id, NULL); if (!grp->killall) synchronize_net(); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e34ea9e5e28b..b2ff6c8d3603 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -487,9 +487,6 @@ static int vlan_dev_stop(struct net_device *dev) struct vlan_dev_info *vlan = vlan_dev_info(dev); struct net_device *real_dev = vlan->real_dev; - if (vlan->flags & VLAN_FLAG_GVRP) - vlan_gvrp_request_leave(dev); - dev_mc_unsync(real_dev, dev); dev_uc_unsync(real_dev, dev); if (dev->flags & IFF_ALLMULTI) From e14a599335427f81bbb0008963e59aa9c6449dce Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 10 May 2011 12:26:06 -0700 Subject: [PATCH 23/27] net: dev_close() should check IFF_UP Commit 443457242beb (factorize sync-rcu call in unregister_netdevice_many) mistakenly removed one test from dev_close() Following actions trigger a BUG : modprobe bonding modprobe dummy ifconfig bond0 up ifenslave bond0 dummy0 rmmod dummy dev_close() must not close a non IFF_UP device. With help from Frank Blaschka and Einar EL Lueck Reported-by: Frank Blaschka Reported-by: Einar EL Lueck Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 856b6ee9a1d5..92009440d28b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1284,11 +1284,13 @@ static int dev_close_many(struct list_head *head) */ int dev_close(struct net_device *dev) { - LIST_HEAD(single); + if (dev->flags & IFF_UP) { + LIST_HEAD(single); - list_add(&dev->unreg_list, &single); - dev_close_many(&single); - list_del(&single); + list_add(&dev->unreg_list, &single); + dev_close_many(&single); + list_del(&single); + } return 0; } EXPORT_SYMBOL(dev_close); From 43a4dea4c9d44baae38ddc14b9b6d86fde4c8b88 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 May 2011 19:36:38 +0000 Subject: [PATCH 24/27] xfrm: Assign the inner mode output function to the dst entry As it is, we assign the outer modes output function to the dst entry when we create the xfrm bundle. This leads to two problems on interfamily scenarios. We might insert ipv4 packets into ip6_fragment when called from xfrm6_output. The system crashes if we try to fragment an ipv4 packet with ip6_fragment. This issue was introduced with git commit ad0081e4 (ipv6: Fragment locally generated tunnel-mode IPSec6 packets as needed). The second issue is, that we might insert ipv4 packets in netfilter6 and vice versa on interfamily scenarios. With this patch we assign the inner mode output function to the dst entry when we create the xfrm bundle. So xfrm4_output/xfrm6_output from the inner mode is used and the right fragmentation and netfilter functions are called. We switch then to outer mode with the output_finish functions. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- include/net/xfrm.h | 3 +++ net/ipv4/xfrm4_output.c | 8 ++++++-- net/ipv4/xfrm4_state.c | 1 + net/ipv6/xfrm6_output.c | 6 +++--- net/ipv6/xfrm6_state.c | 1 + net/xfrm/xfrm_policy.c | 14 +++++++++++++- 6 files changed, 27 insertions(+), 6 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 6ae4bc5ce8a7..20afeaa39395 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -324,6 +324,7 @@ struct xfrm_state_afinfo { int (*tmpl_sort)(struct xfrm_tmpl **dst, struct xfrm_tmpl **src, int n); int (*state_sort)(struct xfrm_state **dst, struct xfrm_state **src, int n); int (*output)(struct sk_buff *skb); + int (*output_finish)(struct sk_buff *skb); int (*extract_input)(struct xfrm_state *x, struct sk_buff *skb); int (*extract_output)(struct xfrm_state *x, @@ -1454,6 +1455,7 @@ static inline int xfrm4_rcv_spi(struct sk_buff *skb, int nexthdr, __be32 spi) extern int xfrm4_extract_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm4_output(struct sk_buff *skb); +extern int xfrm4_output_finish(struct sk_buff *skb); extern int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family); extern int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family); extern int xfrm6_extract_header(struct sk_buff *skb); @@ -1470,6 +1472,7 @@ extern __be32 xfrm6_tunnel_spi_lookup(struct net *net, xfrm_address_t *saddr); extern int xfrm6_extract_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb); extern int xfrm6_output(struct sk_buff *skb); +extern int xfrm6_output_finish(struct sk_buff *skb); extern int xfrm6_find_1stfragopt(struct xfrm_state *x, struct sk_buff *skb, u8 **prevhdr); diff --git a/net/ipv4/xfrm4_output.c b/net/ipv4/xfrm4_output.c index 571aa96a175c..2d51840e53a1 100644 --- a/net/ipv4/xfrm4_output.c +++ b/net/ipv4/xfrm4_output.c @@ -69,7 +69,7 @@ int xfrm4_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm4_prepare_output); -static int xfrm4_output_finish(struct sk_buff *skb) +int xfrm4_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER if (!skb_dst(skb)->xfrm) { @@ -86,7 +86,11 @@ static int xfrm4_output_finish(struct sk_buff *skb) int xfrm4_output(struct sk_buff *skb) { + struct dst_entry *dst = skb_dst(skb); + struct xfrm_state *x = dst->xfrm; + return NF_HOOK_COND(NFPROTO_IPV4, NF_INET_POST_ROUTING, skb, - NULL, skb_dst(skb)->dev, xfrm4_output_finish, + NULL, dst->dev, + x->outer_mode->afinfo->output_finish, !(IPCB(skb)->flags & IPSKB_REROUTED)); } diff --git a/net/ipv4/xfrm4_state.c b/net/ipv4/xfrm4_state.c index 1717c64628d1..805d63ef4340 100644 --- a/net/ipv4/xfrm4_state.c +++ b/net/ipv4/xfrm4_state.c @@ -78,6 +78,7 @@ static struct xfrm_state_afinfo xfrm4_state_afinfo = { .init_tempsel = __xfrm4_init_tempsel, .init_temprop = xfrm4_init_temprop, .output = xfrm4_output, + .output_finish = xfrm4_output_finish, .extract_input = xfrm4_extract_input, .extract_output = xfrm4_extract_output, .transport_finish = xfrm4_transport_finish, diff --git a/net/ipv6/xfrm6_output.c b/net/ipv6/xfrm6_output.c index 8e688b3de9ab..49a91c5f5623 100644 --- a/net/ipv6/xfrm6_output.c +++ b/net/ipv6/xfrm6_output.c @@ -79,7 +79,7 @@ int xfrm6_prepare_output(struct xfrm_state *x, struct sk_buff *skb) } EXPORT_SYMBOL(xfrm6_prepare_output); -static int xfrm6_output_finish(struct sk_buff *skb) +int xfrm6_output_finish(struct sk_buff *skb) { #ifdef CONFIG_NETFILTER IP6CB(skb)->flags |= IP6SKB_XFRM_TRANSFORMED; @@ -97,9 +97,9 @@ static int __xfrm6_output(struct sk_buff *skb) if ((x && x->props.mode == XFRM_MODE_TUNNEL) && ((skb->len > ip6_skb_dst_mtu(skb) && !skb_is_gso(skb)) || dst_allfrag(skb_dst(skb)))) { - return ip6_fragment(skb, xfrm6_output_finish); + return ip6_fragment(skb, x->outer_mode->afinfo->output_finish); } - return xfrm6_output_finish(skb); + return x->outer_mode->afinfo->output_finish(skb); } int xfrm6_output(struct sk_buff *skb) diff --git a/net/ipv6/xfrm6_state.c b/net/ipv6/xfrm6_state.c index afe941e9415c..248f0b2a7ee9 100644 --- a/net/ipv6/xfrm6_state.c +++ b/net/ipv6/xfrm6_state.c @@ -178,6 +178,7 @@ static struct xfrm_state_afinfo xfrm6_state_afinfo = { .tmpl_sort = __xfrm6_tmpl_sort, .state_sort = __xfrm6_state_sort, .output = xfrm6_output, + .output_finish = xfrm6_output_finish, .extract_input = xfrm6_extract_input, .extract_output = xfrm6_extract_output, .transport_finish = xfrm6_transport_finish, diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 15792d8b6272..b4d745ea8ee1 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1406,6 +1406,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, struct net *net = xp_net(policy); unsigned long now = jiffies; struct net_device *dev; + struct xfrm_mode *inner_mode; struct dst_entry *dst_prev = NULL; struct dst_entry *dst0 = NULL; int i = 0; @@ -1436,6 +1437,17 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, goto put_states; } + if (xfrm[i]->sel.family == AF_UNSPEC) { + inner_mode = xfrm_ip2inner_mode(xfrm[i], + xfrm_af2proto(family)); + if (!inner_mode) { + err = -EAFNOSUPPORT; + dst_release(dst); + goto put_states; + } + } else + inner_mode = xfrm[i]->inner_mode; + if (!dst_prev) dst0 = dst1; else { @@ -1464,7 +1476,7 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, dst1->lastuse = now; dst1->input = dst_discard; - dst1->output = xfrm[i]->outer_mode->afinfo->output; + dst1->output = inner_mode->afinfo->output; dst1->next = dst_prev; dst_prev = dst1; From 6fa5ddcc675b937f94d05628e8997c07a80c6cb9 Mon Sep 17 00:00:00 2001 From: Steffen Klassert Date: Mon, 9 May 2011 19:43:05 +0000 Subject: [PATCH 25/27] xfrm: Don't allow esn with disabled anti replay detection Unlike the standard case, disabled anti replay detection needs some nontrivial extra treatment on ESN. RFC 4303 states: Note: If a receiver chooses to not enable anti-replay for an SA, then the receiver SHOULD NOT negotiate ESN in an SA management protocol. Use of ESN creates a need for the receiver to manage the anti-replay window (in order to determine the correct value for the high-order bits of the ESN, which are employed in the ICV computation), which is generally contrary to the notion of disabling anti-replay for an SA. So return an error if an ESN state with disabled anti replay detection is inserted for now and add the extra treatment later if we need it. Signed-off-by: Steffen Klassert Signed-off-by: David S. Miller --- net/xfrm/xfrm_replay.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index e8a781422feb..47f1b8638df9 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -535,6 +535,9 @@ int xfrm_init_replay(struct xfrm_state *x) replay_esn->bmp_len * sizeof(__u32) * 8) return -EINVAL; + if ((x->props.flags & XFRM_STATE_ESN) && replay_esn->replay_window == 0) + return -EINVAL; + if ((x->props.flags & XFRM_STATE_ESN) && x->replay_esn) x->repl = &xfrm_replay_esn; else From aae1e743fee2b5523fb31ee050295f062cb26a31 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Mon, 9 May 2011 07:43:20 +0000 Subject: [PATCH 26/27] net/usb: mark LG VL600 LTE modem ethernet interface as WWAN Like other mobile broadband device ethernet interfaces, mark the LG VL600 with the 'wwan' devtype so userspace knows it needs additional configuration via the AT port before the interface can be used. Signed-off-by: Dan Williams Signed-off-by: David S. Miller --- drivers/net/usb/cdc_ether.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/usb/cdc_ether.c b/drivers/net/usb/cdc_ether.c index a301479ecc60..c924ea2bce07 100644 --- a/drivers/net/usb/cdc_ether.c +++ b/drivers/net/usb/cdc_ether.c @@ -567,7 +567,7 @@ static const struct usb_device_id products [] = { { USB_DEVICE_AND_INTERFACE_INFO(0x1004, 0x61aa, USB_CLASS_COMM, USB_CDC_SUBCLASS_ETHERNET, USB_CDC_PROTO_NONE), - .driver_info = 0, + .driver_info = (unsigned long)&wwan_info, }, /* From 0d4420a90b51abdea71585f571bad6d789ff8eb7 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 10 May 2011 13:12:30 -0700 Subject: [PATCH 27/27] slcan: fix ldisc->open retval TTY layer expects 0 if the ldisc->open operation succeeded. Reported-by: Matvejchikov Ilya Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- drivers/net/can/slcan.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index b423965a78d1..1b49df6b2470 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -583,7 +583,9 @@ static int slcan_open(struct tty_struct *tty) /* Done. We have linked the TTY line to a channel. */ rtnl_unlock(); tty->receive_room = 65536; /* We don't flow control */ - return sl->dev->base_addr; + + /* TTY layer expects 0 on success */ + return 0; err_free_chan: sl->tty = NULL;