Re: Longstanding networking / SMP issue? (duplextest)

David S. Miller (davem@redhat.com)
Sun, 23 Feb 2003 01:12:17 -0800 (PST)


From: Andi Kleen <ak@suse.de>
Date: Fri, 21 Feb 2003 11:45:41 +0100

at least ipip.c will send icmps from process context (ipip_tunnel_xmit)

netfilter will probably do too.

We can easily make them all run in BH (protected) context :-)

This makes the locking much simpler. Add to this the per-cpu socket
idea I proposed and we arrive at the patch below (against 2.5.x,
easily ported to 2.4.x just remove the cpu_possible() stuff).

Comments?

--- net/ipv4/icmp.c.~1~ Sat Feb 22 23:41:53 2003
+++ net/ipv4/icmp.c Sun Feb 23 01:25:43 2003
@@ -223,57 +223,29 @@
static struct icmp_control icmp_pointers[NR_ICMP_TYPES+1];

/*
- * The ICMP socket. This is the most convenient way to flow control
+ * The ICMP socket(s). This is the most convenient way to flow control
* our ICMP output as well as maintain a clean interface throughout
* all layers. All Socketless IP sends will soon be gone.
+ *
+ * On SMP we have one ICMP socket per-cpu.
*/
-struct socket *icmp_socket;
-
-/* ICMPv4 socket is only a bit non-reenterable (unlike ICMPv6,
- which is strongly non-reenterable). A bit later it will be made
- reenterable and the lock may be removed then.
- */
-
-static int icmp_xmit_holder = -1;
-
-static int icmp_xmit_lock_bh(void)
-{
- int rc;
- if (!spin_trylock(&icmp_socket->sk->lock.slock)) {
- rc = -EAGAIN;
- if (icmp_xmit_holder == smp_processor_id())
- goto out;
- spin_lock(&icmp_socket->sk->lock.slock);
- }
- rc = 0;
- icmp_xmit_holder = smp_processor_id();
-out:
- return rc;
-}
+static struct socket *__icmp_socket[NR_CPUS];
+#define icmp_socket __icmp_socket[smp_processor_id()]

-static __inline__ int icmp_xmit_lock(void)
+static __inline__ void icmp_xmit_lock(void)
{
- int ret;
local_bh_disable();
- ret = icmp_xmit_lock_bh();
- if (ret)
- local_bh_enable();
- return ret;
-}

-static void icmp_xmit_unlock_bh(void)
-{
- icmp_xmit_holder = -1;
- spin_unlock(&icmp_socket->sk->lock.slock);
+ if (!spin_trylock(&icmp_socket->sk->lock.slock))
+ BUG();
}

-static __inline__ void icmp_xmit_unlock(void)
+static void icmp_xmit_unlock(void)
{
- icmp_xmit_unlock_bh();
+ spin_unlock(&icmp_socket->sk->lock.slock);
local_bh_enable();
}

-
/*
* Send an ICMP frame.
*/
@@ -404,10 +376,11 @@
struct rtable *rt = (struct rtable *)skb->dst;
u32 daddr;

- if (ip_options_echo(&icmp_param->replyopts, skb) ||
- icmp_xmit_lock_bh())
+ if (ip_options_echo(&icmp_param->replyopts, skb))
goto out;

+ icmp_xmit_lock();
+
icmp_param->data.icmph.checksum = 0;
icmp_out_count(icmp_param->data.icmph.type);

@@ -434,7 +407,7 @@
icmp_push_reply(icmp_param, &ipc, rt);
ip_rt_put(rt);
out_unlock:
- icmp_xmit_unlock_bh();
+ icmp_xmit_unlock();
out:;
}

@@ -519,8 +492,7 @@
}
}

- if (icmp_xmit_lock())
- goto out;
+ icmp_xmit_lock();

/*
* Construct source address and options.
@@ -1141,19 +1113,30 @@
void __init icmp_init(struct net_proto_family *ops)
{
struct inet_opt *inet;
- int err = sock_create(PF_INET, SOCK_RAW, IPPROTO_ICMP, &icmp_socket);
+ int i;

- if (err < 0)
- panic("Failed to create the ICMP control socket.\n");
- icmp_socket->sk->allocation = GFP_ATOMIC;
- icmp_socket->sk->sndbuf = SK_WMEM_MAX * 2;
- inet = inet_sk(icmp_socket->sk);
- inet->ttl = MAXTTL;
- inet->pmtudisc = IP_PMTUDISC_DONT;
-
- /* Unhash it so that IP input processing does not even
- * see it, we do not wish this socket to see incoming
- * packets.
- */
- icmp_socket->sk->prot->unhash(icmp_socket->sk);
+ for (i = 0; i < NR_CPUS; i++) {
+ int err;
+
+ if (!cpu_possible(i))
+ continue;
+
+ err = sock_create(PF_INET, SOCK_RAW, IPPROTO_ICMP,
+ &__icmp_socket[i]);
+
+ if (err < 0)
+ panic("Failed to create the ICMP control socket.\n");
+
+ __icmp_socket[i]->sk->allocation = GFP_ATOMIC;
+ __icmp_socket[i]->sk->sndbuf = SK_WMEM_MAX * 2;
+ inet = inet_sk(__icmp_socket[i]->sk);
+ inet->ttl = MAXTTL;
+ inet->pmtudisc = IP_PMTUDISC_DONT;
+
+ /* Unhash it so that IP input processing does not even
+ * see it, we do not wish this socket to see incoming
+ * packets.
+ */
+ __icmp_socket[i]->sk->prot->unhash(__icmp_socket[i]->sk);
+ }
}
--- net/ipv6/icmp.c.~1~ Sat Feb 22 23:48:49 2003
+++ net/ipv6/icmp.c Sun Feb 23 01:27:09 2003
@@ -67,10 +67,11 @@
DEFINE_SNMP_STAT(struct icmpv6_mib, icmpv6_statistics);

/*
- * ICMP socket for flow control.
+ * ICMP socket(s) for flow control.
*/

-struct socket *icmpv6_socket;
+static struct socket *__icmpv6_socket[NR_CPUS];
+#define icmpv6_socket __icmpv6_socket[smp_processor_id()]

static int icmpv6_rcv(struct sk_buff *skb);

@@ -87,39 +88,16 @@
__u32 csum;
};

-
-static int icmpv6_xmit_holder = -1;
-
-static int icmpv6_xmit_lock_bh(void)
+static __inline__ void icmpv6_xmit_lock(void)
{
- if (!spin_trylock(&icmpv6_socket->sk->lock.slock)) {
- if (icmpv6_xmit_holder == smp_processor_id())
- return -EAGAIN;
- spin_lock(&icmpv6_socket->sk->lock.slock);
- }
- icmpv6_xmit_holder = smp_processor_id();
- return 0;
-}
-
-static __inline__ int icmpv6_xmit_lock(void)
-{
- int ret;
local_bh_disable();
- ret = icmpv6_xmit_lock_bh();
- if (ret)
- local_bh_enable();
- return ret;
-}
-
-static void icmpv6_xmit_unlock_bh(void)
-{
- icmpv6_xmit_holder = -1;
- spin_unlock(&icmpv6_socket->sk->lock.slock);
+ if (!spin_trylock(&icmpv6_socket->sk->lock.slock))
+ BUG();
}

static __inline__ void icmpv6_xmit_unlock(void)
{
- icmpv6_xmit_unlock_bh();
+ spin_unlock(&icmpv6_socket->sk->lock.slock);
local_bh_enable();
}

@@ -341,8 +319,7 @@
fl.uli_u.icmpt.type = type;
fl.uli_u.icmpt.code = code;

- if (icmpv6_xmit_lock())
- return;
+ icmpv6_xmit_lock();

if (!icmpv6_xrlim_allow(sk, type, &fl))
goto out;
@@ -415,15 +392,14 @@
fl.uli_u.icmpt.type = ICMPV6_ECHO_REPLY;
fl.uli_u.icmpt.code = 0;

- if (icmpv6_xmit_lock_bh())
- return;
+ icmpv6_xmit_lock();

ip6_build_xmit(sk, icmpv6_getfrag, &msg, &fl, msg.len, NULL, -1,
MSG_DONTWAIT);
ICMP6_INC_STATS_BH(Icmp6OutEchoReplies);
ICMP6_INC_STATS_BH(Icmp6OutMsgs);

- icmpv6_xmit_unlock_bh();
+ icmpv6_xmit_unlock();
}

static void icmpv6_notify(struct sk_buff *skb, int type, int code, u32 info)
@@ -626,26 +602,47 @@
int __init icmpv6_init(struct net_proto_family *ops)
{
struct sock *sk;
- int err;
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ int err;
+
+ if (!cpu_possible(i))
+ continue;

- err = sock_create(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6, &icmpv6_socket);
- if (err < 0) {
- printk(KERN_ERR
- "Failed to initialize the ICMP6 control socket (err %d).\n",
- err);
- icmpv6_socket = NULL; /* for safety */
- return err;
+ err = sock_create(PF_INET6, SOCK_RAW, IPPROTO_ICMPV6,
+ &__icmpv6_socket[i]);
+ if (err < 0) {
+ int j;
+
+ printk(KERN_ERR
+ "Failed to initialize the ICMP6 control socket "
+ "(err %d).\n",
+ err);
+ for (j = 0; j < i; j++) {
+ if (!cpu_possible(j))
+ continue;
+ sock_release(__icmpv6_socket[j]);
+ __icmpv6_socket[j] = NULL; /* for safety */
+ }
+ return err;
+ }
+
+ sk = __icmpv6_socket[i]->sk;
+ sk->allocation = GFP_ATOMIC;
+ sk->sndbuf = SK_WMEM_MAX*2;
+ sk->prot->unhash(sk);
}

- sk = icmpv6_socket->sk;
- sk->allocation = GFP_ATOMIC;
- sk->sndbuf = SK_WMEM_MAX*2;
- sk->prot->unhash(sk);

if (inet6_add_protocol(&icmpv6_protocol, IPPROTO_ICMPV6) < 0) {
printk(KERN_ERR "Failed to register ICMP6 protocol\n");
- sock_release(icmpv6_socket);
- icmpv6_socket = NULL;
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_possible(i))
+ continue;
+ sock_release(__icmpv6_socket[i]);
+ __icmpv6_socket[i] = NULL;
+ }
return -EAGAIN;
}

@@ -654,8 +651,14 @@

void icmpv6_cleanup(void)
{
- sock_release(icmpv6_socket);
- icmpv6_socket = NULL; /* For safety. */
+ int i;
+
+ for (i = 0; i < NR_CPUS; i++) {
+ if (!cpu_possible(i))
+ continue;
+ sock_release(__icmpv6_socket[i]);
+ __icmpv6_socket[i] = NULL; /* For safety. */
+ }
inet6_del_protocol(&icmpv6_protocol, IPPROTO_ICMPV6);
}

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/