5

On one of my ubuntu server, top command start very slow, when I run top command on terminal, it will display info after more than 10 seconds.

Then I use strace -yy top to analysis the problem, I found top hang at the following point:

getsockopt(6<UNIX:[366379599]>, SOL_SOCKET, SO_SNDBUF, [212992], [4]) = 0                                                                 [0/1911]
setsockopt(6<UNIX:[366379599]>, SOL_SOCKET, SO_SNDBUFFORCE, [8388608], 4) = 0
connect(6<UNIX:[366379599]>, {sa_family=AF_UNIX, sun_path="/run/dbus/system_bus_socket"}, 29) = 0
getsockopt(6<UNIX:[366379599->366381191]>, SOL_SOCKET, SO_PEERCRED, {pid=1, uid=0, gid=0}, [12]) = 0
getsockopt(6<UNIX:[366379599->366381191]>, SOL_SOCKET, SO_PEERSEC, "unconfined", [64->10]) = 0
getsockopt(6<UNIX:[366379599->366381191]>, SOL_SOCKET, SO_PEERGROUPS, "", [256->0]) = 0
fstat(6<UNIX:[366379599->366381191]>, {st_mode=S_IFSOCK|0777, st_size=0, ...}) = 0
getsockopt(6<UNIX:[366379599->366381191]>, SOL_SOCKET, SO_ACCEPTCONN, [0], [4]) = 0
getsockname(6<UNIX:[366379599->366381191]>, {sa_family=AF_UNIX}, [128->2]) = 0
geteuid()                               = 0
sendmsg(6<UNIX:[366379599->366381191]>, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="\0AUTH EXTERNAL ", iov_len=15}, {iov_base="30", iov_len
=2}, {iov_base="\r\nNEGOTIATE_UNIX_FD\r\nBEGIN\r\n", iov_len=28}], msg_iovlen=3, msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 45
gettid()                                = 25127
getrandom("\x00\x3c\x86\xd4\x34\xa0\xb2\x91\xee\x2d\x4b\x2c\x1b\x56\xda\xa8", 16, GRND_NONBLOCK) = 16
futex(0x7f00826d9ff0, FUTEX_WAKE_PRIVATE, 2147483647) = 0
recvmsg(6<UNIX:[366379599->366381191]>, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="OK 141ec8c80f29bf83dc576f5e61641"..., iov_len=256}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_DONTWAIT|MSG_NOSIGNAL|MSG_CMSG_CLOEXEC) = 52
sendmsg(6<UNIX:[366379599->366381191]>, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="l\1\0\1\0\0\0\0\1\0\0\0m\0\0\0\1\1o\0\25\0\0\0/org/fre"
..., iov_len=128}], msg_iovlen=1, msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 128
recvmsg(6<UNIX:[366379599->366381191]>, {msg_namelen=0}, MSG_DONTWAIT|MSG_NOSIGNAL|MSG_CMSG_CLOEXEC) = -1 EAGAIN (Resource temporarily unavailable
)
ppoll([{fd=6<UNIX:[366379599->366381191]>, events=POLLIN}], 1, {tv_sec=24, tv_nsec=999897000}, NULL, 8) = 1 ([{fd=6, revents=POLLIN}], left {tv_sec=24, tv_nsec=999818992})
recvmsg(6<UNIX:[366379599->366381191]>, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="l\2\1\1\17\0\0\0\1\0\0\0E\0\0\0\6\1s\0\n\0\0\0", iov_le
n=24}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_DONTWAIT|MSG_NOSIGNAL|MSG_CMSG_CLOEXEC) = 24
recvmsg(6<UNIX:[366379599->366381191]>, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base=":1.2843938\0\0\0\0\0\0\5\1u\0\1\0\0\0\10\1g\0\1s\0\0"..
., iov_len=79}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_DONTWAIT|MSG_NOSIGNAL|MSG_CMSG_CLOEXEC) = 79
sendmsg(6<UNIX:[366379599->366381191]>, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="l\1\0\1\4\0\0\0\2\0\0\0\247\0\0\0\1\1o\0\31\0\0\0/org/f
re"..., iov_len=184}, {iov_base="w\364\0\0", iov_len=4}], msg_iovlen=2, msg_controllen=0, msg_flags=0}, MSG_DONTWAIT|MSG_NOSIGNAL) = 188
recvmsg(6<UNIX:[366379599->366381191]>, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="l\4\1\1\17\0\0\0\2\0\0\0\225\0\0\0\1\1o\0\25\0\0\0", io
v_len=24}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_DONTWAIT|MSG_NOSIGNAL|MSG_CMSG_CLOEXEC) = 24
recvmsg(6<UNIX:[366379599->366381191]>, {msg_name=NULL, msg_namelen=0, msg_iov=[{iov_base="/org/freedesktop/DBus\0\0\0\2\1s\0\24\0\0\0"..., iov_le
n=159}], msg_iovlen=1, msg_controllen=0, msg_flags=MSG_CMSG_CLOEXEC}, MSG_DONTWAIT|MSG_NOSIGNAL|MSG_CMSG_CLOEXEC) = 159
recvmsg(6<UNIX:[366379599->366381191]>, {msg_namelen=0}, MSG_DONTWAIT|MSG_NOSIGNAL|MSG_CMSG_CLOEXEC) = -1 EAGAIN (Resource temporarily unavailable
)
ppoll([{fd=6<UNIX:[366379599->366381191]>, events=POLLIN}], 1, {tv_sec=24, tv_nsec=999853000}, NULL, 8

And the command ls -l /proc/$(pgrep top)/fd/ get the following output:

lrwx------ 1 root root 64 1月  25 18:11 0 -> /dev/pts/9
lrwx------ 1 root root 64 1月  25 18:11 1 -> /dev/pts/9
l-wx------ 1 root root 64 1月  25 18:11 2 -> /dev/null
lrwx------ 1 root root 64 1月  25 18:11 3 -> /dev/pts/9
lr-x------ 1 root root 64 1月  25 18:11 4 -> /proc/uptime
lr-x------ 1 root root 64 1月  25 18:11 5 -> /proc
lrwx------ 1 root root 64 1月  25 18:11 6 -> 'socket:[364876425]'

lsof -d 6 -U -a +E -p $(pgrep top) get following output:

COMMAND     PID       USER   FD   TYPE             DEVICE SIZE/OFF      NODE NAME
dbus-daem   874 messagebus   12u  unix 0xffff9545f6fee400      0t0 366381191 /var/run/dbus/system_bus_socket type=STREAM ->INO=366379599 25127,top,6u
top       25127       root    6u  unix 0xffff9545f6fefc00      0t0 366379599 type=STREAM ->INO=366381191 874,dbus-daem,12u

After top works as normal, the command ls -l /proc/$(pgrep top)/fd/ get the following output:

lrwx------ 1 root root 64 1月  25 18:11 0 -> /dev/pts/9
lrwx------ 1 root root 64 1月  25 18:11 1 -> /dev/pts/9
l-wx------ 1 root root 64 1月  25 18:11 2 -> /dev/null
lrwx------ 1 root root 64 1月  25 18:11 3 -> /dev/pts/9
lr-x------ 1 root root 64 1月  25 18:11 4 -> /proc/uptime
lr-x------ 1 root root 64 1月  25 18:11 5 -> /proc/meminfo
lr-x------ 1 root root 64 1月  25 18:11 6 -> /proc/loadavg
lr-x------ 1 root root 64 1月  25 18:12 7 -> /proc/stat

lsof -d 6 -U -a +E -p $(pgrep top) output nothing.

I want to know the reason that why top command start so slow, can anyone help me?

Ren
  • 273
  • If you saved strace output to a file, check what fd 6 is. You can also add -yy to strace, it will show you the socket details. And next time you can run lsof -p to see where the socket is connected to. – aviro Jan 25 '22 at 09:03
  • That's still not enough information from strace. You should at least show the socket and connect system calls that created the socket. Secondly, you can run lsof -U -a +E -p $(pgrep top) to see the unix socket information and what's the end points of the socket. Or even, if the stuck file descriptor is always 6, you can add -d6 to your lsof command to see only the socket information and the endpoint for this file descriptor. – aviro Jan 25 '22 at 12:22
  • @aviro I have updated my question to show more information. – Ren Jan 26 '22 at 02:54

1 Answers1

0

When I look at my own strace, I see that the only recvmsg and poll happen when top is trying to communicate with the socket file of nscd (/var/run/nscd/socket).

socket(PF_LOCAL, SOCK_STREAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0) = 10
connect(10<UNIX:[355032774]>, {sa_family=AF_LOCAL, sun_path="/var/run/nscd/socket"}, 110) = 0
sendto(10<UNIX:[355032774->355027846]>, "\2\0\0\0\v\0\0\0\7\0\0\0passwd\0", 19, MSG_NOSIGNAL, NULL, 0) = 19
poll([{fd=10<UNIX:[355032774]>, events=POLLIN|POLLERR|POLLHUP}], 1, 5000) = 1 ([{fd=10, revents=POLLIN|POLLHUP}])
recvmsg(10<UNIX:[355032774]>, {msg_name(0)=NULL, msg_iov(2)=[{"passwd\0", 7}, {"\264K\5\0\0\0\0\0", 8}], msg_controllen=20, [{cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, [11</run/nscd/passwd>]}], msg_flags=MSG_CMSG_CLOEXEC}, MSG_CMSG_CLOEXEC) = 15

If that's what you also see, you may have a problem with nscd. top tries to get passwd information from the nscd cache, and the process might be hung. Please try to stop nscd (service nscd stop) and see if it solves the problem. If it does solve the problem with top, try to restart nscd again (service nscd start) and check if after restart nscd works properly.

aviro
  • 5,532
  • There is no nscd service on my ubuntu server. I have updated my question to show more strace info. – Ren Jan 25 '22 at 12:04