[PATCH] better RPC error counters

Chuck Lever (cel@citi.umich.edu)
Tue, 26 Nov 2002 15:03:24 -0500 (EST)


Description:
add new error counters to the RPC client to help sysadmins identify
problems with their NFS clients. this patch adds counting of:

+ TCP socket connection attempts, reconnects, and connection errors
+ garbage RPC replies
+ partial TCP writes (these indicate socket output buffer shortages)
+ RPC replies with unrecognized XIDs (due to client races or server bugs)
+ RPC replies with bad RPC verifiers (due to request corruption)
+ UDP send errors (due to problems in the network layer)
+ RPC client memory shortages

the new counters appear in /proc/net/rpc/nfs on the "rpc" line. the
extra values in this file do not affect the operation of commands like
"nfsstat."

Apply Against:
2.5.49

Test status:
Compiles, links, boots. Counters appear in /proc/net/rpc/nfs.

diff -Naur 06-timeout-check/include/linux/sunrpc/stats.h 07-counters/include/linux/sunrpc/stats.h
--- 06-timeout-check/include/linux/sunrpc/stats.h Fri Nov 22 16:40:42 2002
+++ 07-counters/include/linux/sunrpc/stats.h Mon Nov 25 13:30:58 2002
@@ -1,7 +1,7 @@
/*
* linux/include/linux/sunrpc/stats.h
*
- * Client statistics collection for SUN RPC
+ * Statistics collection for SUN RPC
*
* Copyright (C) 1996 Olaf Kirch <okir@monad.swb.de>
*/
@@ -11,33 +11,40 @@

#include <linux/config.h>
#include <linux/proc_fs.h>
+#include <linux/cache.h>

struct rpc_stat {
struct rpc_program * program;

- unsigned int netcnt,
+ unsigned long netcnt,
netudpcnt,
nettcpcnt,
nettcpconn,
netreconn;
- unsigned int rpccnt,
+ unsigned long rpccnt,
rpcretrans,
rpcauthrefresh,
- rpcgarbage;
-};
+ rpcgarbage,
+ rpcnospace,
+ rpcbadxids,
+ rpcbadverfs,
+ rpccantconn,
+ rpcnomem,
+ rpccantsend;
+} ____cacheline_aligned;

struct svc_stat {
struct svc_program * program;

- unsigned int netcnt,
+ unsigned long netcnt,
netudpcnt,
nettcpcnt,
nettcpconn;
- unsigned int rpccnt,
+ unsigned long rpccnt,
rpcbadfmt,
rpcbadauth,
rpcbadclnt;
-};
+} ____cacheline_aligned;

void rpc_proc_init(void);
void rpc_proc_exit(void);
diff -Naur 06-timeout-check/include/linux/sunrpc/xprt.h 07-counters/include/linux/sunrpc/xprt.h
--- 06-timeout-check/include/linux/sunrpc/xprt.h Mon Nov 25 13:28:45 2002
+++ 07-counters/include/linux/sunrpc/xprt.h Mon Nov 25 13:30:58 2002
@@ -153,6 +153,7 @@
unsigned long sockstate; /* Socket state */
unsigned char shutdown : 1, /* being shut down */
nocong : 1, /* no congestion control */
+ firstconn : 1, /* no connection attempt yet */
resvport : 1, /* use a reserved port */
stream : 1; /* TCP */

diff -Naur 06-timeout-check/net/sunrpc/clnt.c 07-counters/net/sunrpc/clnt.c
--- 06-timeout-check/net/sunrpc/clnt.c Mon Nov 25 13:26:36 2002
+++ 07-counters/net/sunrpc/clnt.c Mon Nov 25 13:30:58 2002
@@ -445,6 +445,8 @@
xprt_release(task);
}

+ task->tk_client->cl_stats->rpcnomem++;
+
switch (status) {
case -EAGAIN: /* woken up; retry */
task->tk_action = call_reserve;
@@ -485,6 +487,7 @@
if (RPC_IS_ASYNC(task) || !(task->tk_client->cl_intr && signalled())) {
xprt_release(task);
task->tk_action = call_reserve;
+ task->tk_client->cl_stats->rpcnomem++;
rpc_delay(task, HZ>>4);
return;
}
@@ -661,6 +664,7 @@
req->rq_bytes_sent = 0;
break;
case -EAGAIN:
+ clnt->cl_stats->rpcnospace++;
task->tk_action = call_transmit;
break;
case -EIO:
@@ -766,14 +770,16 @@
}

/* Verify the RPC header */
- if (!(p = call_verify(task)))
+ if (!(p = call_verify(task))) {
+ clnt->cl_stats->rpcbadverfs++;
return;
+ }

/*
* The following is an NFS-specific hack to cater for setuid
* processes whose uid is mapped to nobody on the server.
*/
- if (task->tk_client->cl_droppriv &&
+ if (clnt->cl_droppriv &&
(ntohl(*p) == NFSERR_ACCES || ntohl(*p) == NFSERR_PERM)) {
if (RPC_IS_SETUID(task) && task->tk_suid_retry) {
dprintk("RPC: %4d retry squashed uid\n", task->tk_pid);
diff -Naur 06-timeout-check/net/sunrpc/stats.c 07-counters/net/sunrpc/stats.c
--- 06-timeout-check/net/sunrpc/stats.c Fri Nov 22 16:41:13 2002
+++ 07-counters/net/sunrpc/stats.c Mon Nov 25 13:30:58 2002
@@ -40,16 +40,23 @@
int len, i, j;

len = sprintf(buffer,
- "net %d %d %d %d\n",
+ "net %lu %lu %lu %lu\n",
statp->netcnt,
statp->netudpcnt,
statp->nettcpcnt,
statp->nettcpconn);
len += sprintf(buffer + len,
- "rpc %d %d %d\n",
+ "rpc %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
statp->rpccnt,
statp->rpcretrans,
- statp->rpcauthrefresh);
+ statp->rpcauthrefresh,
+ statp->rpcgarbage,
+ statp->rpcnospace,
+ statp->rpcbadxids,
+ statp->rpcbadverfs,
+ statp->rpccantconn,
+ statp->rpcnomem,
+ statp->rpccantsend);

for (i = 0; i < prog->nrvers; i++) {
if (!(vers = prog->version[i]))
@@ -88,13 +95,13 @@
int len, i, j;

len = sprintf(buffer,
- "net %d %d %d %d\n",
+ "net %lu %lu %lu %lu\n",
statp->netcnt,
statp->netudpcnt,
statp->nettcpcnt,
statp->nettcpconn);
len += sprintf(buffer + len,
- "rpc %d %d %d %d %d\n",
+ "rpc %lu %lu %lu %lu %lu\n",
statp->rpccnt,
statp->rpcbadfmt+statp->rpcbadauth+statp->rpcbadclnt,
statp->rpcbadfmt,
diff -Naur 06-timeout-check/net/sunrpc/xprt.c 07-counters/net/sunrpc/xprt.c
--- 06-timeout-check/net/sunrpc/xprt.c Mon Nov 25 13:28:45 2002
+++ 07-counters/net/sunrpc/xprt.c Mon Nov 25 13:30:58 2002
@@ -212,6 +212,7 @@
static inline int
xprt_sendmsg(struct rpc_xprt *xprt, struct rpc_rqst *req)
{
+ struct rpc_stat *stats = req->rq_task->tk_client->cl_stats;
struct socket *sock = xprt->sock;
struct msghdr msg;
struct xdr_buf *xdr = &req->rq_snd_buf;
@@ -223,6 +224,12 @@
if (!sock)
return -ENOTCONN;

+ stats->netcnt++;
+ if (xprt->stream)
+ stats->nettcpcnt++;
+ else
+ stats->netudpcnt++;
+
xprt_pktdump("packet data:",
req->rq_svec->iov_base,
req->rq_svec->iov_len);
@@ -257,16 +264,21 @@
/* When the server has died, an ICMP port unreachable message
* prompts ECONNREFUSED.
*/
+ stats->rpccantsend++;
+ break;
case -EAGAIN:
break;
case -ENOTCONN:
case -EPIPE:
/* connection broken */
+ stats->rpccantsend++;
if (xprt->stream)
result = -ENOTCONN;
break;
default:
+ stats->rpccantsend++;
printk(KERN_NOTICE "RPC: sendmsg returned error %d\n", -result);
+ break;
}
return result;
}
@@ -405,6 +417,7 @@
void
xprt_connect(struct rpc_task *task)
{
+ struct rpc_stat *stats = task->tk_client->cl_stats;
struct rpc_xprt *xprt = task->tk_xprt;
struct socket *sock = xprt->sock;
struct sock *inet;
@@ -476,6 +489,11 @@
if (inet->state != TCP_ESTABLISHED) {
xprt_close(xprt);
task->tk_status = -EAGAIN;
+ stats->nettcpconn++;
+ if (!xprt->firstconn) {
+ stats->netreconn++;
+ xprt->firstconn = 0;
+ }
break;
}

@@ -486,11 +504,13 @@
case -EPIPE:
xprt_close(xprt);
task->tk_status = -ENOTCONN;
+ stats->rpccantconn++;
break;

default:
/* Report myriad other possible returns, and exit. */
xprt_close(xprt);
+ stats->rpccantconn++;
printk(KERN_WARNING
"RPC: error %d connecting to server %s, exiting\n",
-status, task->tk_client->cl_server);
@@ -507,12 +527,18 @@
static void
xprt_conn_status(struct rpc_task *task)
{
+ struct rpc_stat *stats = task->tk_client->cl_stats;
struct rpc_xprt *xprt = task->tk_xprt;

switch (task->tk_status) {
case 0:
dprintk("RPC: %4d xprt_conn_status: connection established\n",
task->tk_pid);
+ stats->nettcpconn++;
+ if (!xprt->firstconn) {
+ stats->netreconn++;
+ xprt->firstconn = 0;
+ }
xprt_release_write(xprt, task);
return;
case -ETIMEDOUT:
@@ -544,6 +570,8 @@
* requests from retrying the connect */
if (task->tk_status != -EAGAIN)
xprt_release_write(xprt, task);
+ else
+ stats->rpccantconn++;
}

/*
@@ -1367,6 +1395,7 @@

xprt->stream = 1;
xprt->nocong = 1;
+ xprt->firstconn = 1;
xprt->cwnd = RPC_MAXCWND;

/* Set timeout parameters */

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/