[Varnish] #516: vsl_mtx "deadlock"; child stops responding
Varnish
varnish-bugs at projects.linpro.no
Fri Jun 5 03:58:33 CEST 2009
#516: vsl_mtx "deadlock"; child stops responding
----------------------+-----------------------------------------------------
Reporter: kb | Owner: phk
Type: defect | Status: new
Priority: normal | Milestone:
Component: varnishd | Version: 2.0
Severity: normal | Keywords:
----------------------+-----------------------------------------------------
I'm seeing children that reliably stop responding, every day at roughly
the same time:
{{{
Jun 4 07:08:32 statcache0 varnishd[5958]: Child (26929) not responding to
ping, killing it.
Jun 4 07:08:36 statcache0 last message repeated 2 times
Jun 4 07:08:36 statcache0 varnishd[5958]: Child (26929) died signal=3
(core dumped)
Jun 4 07:08:36 statcache0 varnishd[5958]: Child cleanup complete
}}}
GDB:
{{{
(gdb) where
#0 0x00007f054c4b1174 in __lll_lock_wait () from /lib/libpthread.so.0
#1 0x00007f054c4acb08 in _L_lock_104 () from /lib/libpthread.so.0
#2 0x00007f054c4ac470 in pthread_mutex_lock () from /lib/libpthread.so.0
#3 0x000000000042de2b in VSL (tag=SLT_CLI, id=0, fmt=0x43b502 "Rd %s") at
shmlog.c:154
#4 0x0000000000411e95 in cli_vlu (priv=0x7fff553115e0,
p=0xffffffffffffffff <Address 0xffffffffffffffff out of bounds>) at
cache_cli.c:105
#5 0x00007f054ceec472 in LineUpProcess (l=0x7f054bb08370) at vlu.c:156
#6 0x0000000000411d9c in CLI_Run () at cache_cli.c:165
#7 0x000000000041a243 in child_main () at cache_main.c:134
#8 0x0000000000428a0a in start_child (cli=0x0) at mgt_child.c:319
#9 0x0000000000429212 in mgt_sigchld (e=<value optimized out>,
what=<value optimized out>) at mgt_child.c:472
#10 0x00007f054ceeb4ea in vev_sched_signal (evb=0x7f054bb0d040) at
vev.c:437
#11 0x00007f054ceebb3d in vev_schedule (evb=0x7f054bb0d040) at vev.c:365
#12 0x0000000000428cca in mgt_run (dflag=0, T_arg=<value optimized out>)
at mgt_child.c:560
#13 0x000000000043228a in main (argc=<value optimized out>,
argv=0x7fff55311d48) at varnishd.c:655
}}}
It's a block on vsl_mtx: the CLI thread can't answer the manager's pings
because VSL(SLT_CLI, ...) is stuck waiting for the lock (hence the
manager killing the child), and most of the other threads are blocked
too:
{{{
(gdb) info thread
16 process 26930 0x00007f054c4b1e81 in nanosleep () from
/lib/libpthread.so.0
15 process 26931 0x00007f054c4aeb99 in pthread_cond_wait@@GLIBC_2.3.2
() from /lib/libpthread.so.0
14 process 26933 0x00007f054c4b1174 in __lll_lock_wait () from
/lib/libpthread.so.0
13 process 26942 0x00007f054c4b1174 in __lll_lock_wait () from
/lib/libpthread.so.0
12 process 26943 0x00007f054bd76c86 in poll () from /lib/libc.so.6
11 process 8811 0x00007f054c4b1174 in __lll_lock_wait () from
/lib/libpthread.so.0
10 process 15459 0x00007f054bd788e3 in writev () from /lib/libc.so.6
9 process 15676 0x000000000042dadd in WSL_Flush (w=0x44e39be0,
overflow=<value optimized out>) at shmlog.c:194
8 process 17612 0x00007f054c4b1174 in __lll_lock_wait () from
/lib/libpthread.so.0
7 process 18638 0x00007f054c4b1174 in __lll_lock_wait () from
/lib/libpthread.so.0
6 process 19059 0x00007f054c4b1174 in __lll_lock_wait () from
/lib/libpthread.so.0
5 process 20041 0x00007f054bd788e3 in writev () from /lib/libc.so.6
4 process 20226 0x00007f054c4b1174 in __lll_lock_wait () from
/lib/libpthread.so.0
3 process 22666 HSH_Lookup (sp=0x7f04ff330008) at cache_hash.c:297
2 process 1427 0x00007f054bd788e3 in writev () from /lib/libc.so.6
* 1 process 26929 0x00007f054c4b1174 in __lll_lock_wait () from
/lib/libpthread.so.0
}}}
The blocked threads are waiting on either the LOCKSHM(&vsl_mtx) in VSL()
(line 154 in 2.0.4) or the LOCKSHM(&vsl_mtx) in WSL_Flush() (line 187).
One thread is always at line 194 of WSL_Flush():
{{{
p[l] = SLT_ENDMARKER;
}}}
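To make the pile-up easier to follow, here's my condensed paraphrase of
the pattern, not the verbatim 2.0.4 shmlog.c (vsl_record(), wsl_flush(),
logstart and log_ptr are stand-in names): every logging path funnels
through the one vsl_mtx, so a single store going wrong inside
WSL_Flush() while the lock is held wedges the whole child, CLI thread
included.
{{{
/*
 * Condensed sketch, NOT the verbatim 2.0.4 shmlog.c; minimal stand-in
 * types and names, just to show the locking shape.
 */
#include <pthread.h>
#include <string.h>

#define SLT_ENDMARKER 255                /* placeholder value */

static pthread_mutex_t vsl_mtx = PTHREAD_MUTEX_INITIALIZER;
static unsigned char *logstart;          /* base of the mmap'd shmlog */
static unsigned long log_ptr;            /* current write offset in shm */

/* VSL(): one record straight into shm; frame #3 of the CLI trace
 * blocks on this lock (shmlog.c:154). */
static void
vsl_record(const unsigned char *rec, unsigned l)
{
	pthread_mutex_lock(&vsl_mtx);    /* LOCKSHM(&vsl_mtx) */
	memcpy(logstart + log_ptr, rec, l);
	log_ptr += l;
	pthread_mutex_unlock(&vsl_mtx);
}

/* WSL_Flush(): copy a worker's private buffer (wlb..wlp) into shm. */
static void
wsl_flush(const unsigned char *wlb, const unsigned char *wlp)
{
	unsigned char *p;
	unsigned l = (unsigned)(wlp - wlb);

	pthread_mutex_lock(&vsl_mtx);    /* LOCKSHM(&vsl_mtx), line 187 */
	p = logstart + log_ptr;          /* a garbage offset puts p
					  * outside the segment */
	memcpy(p + 1, wlb + 1, l - 1);
	p[l] = SLT_ENDMARKER;            /* line 194: if this store stalls
					  * or faults, vsl_mtx stays held
					  * and everyone __lll_lock_waits */
	log_ptr += l;
	pthread_mutex_unlock(&vsl_mtx);
}
}}}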
p is pretty weird: 2^64^-1 in the trace above, and out of bounds again
in the thread stuck at line 194:
{{{
(gdb) where full
#0 0x000000000042dadd in WSL_Flush (w=0x44e39be0, overflow=<value
optimized out>) at shmlog.c:194
p = (unsigned char *) 0x7f0549fc688c <Address 0x7f0549fc688c out
of bounds>
l = 1958
__func__ = "WSL_Flush"
#1 0x0000000000410b17 in cnt_done (sp=0x7f04ff32b008) at
cache_center.c:235
dh = <value optimized out>
dp = <value optimized out>
da = <value optimized out>
pfd = {{fd = -13455352, events = 32516, revents = 0}}
i = <value optimized out>
__func__ = "cnt_done"
#2 0x0000000000411019 in CNT_Session (sp=0x7f04ff32b008) at steps.h:44
done = 0
w = (struct worker *) 0x44e39be0
__func__ = "CNT_Session"
#3 0x000000000041cb6d in wrk_do_cnt_sess (w=0x44e39be0, priv=<value
optimized out>) at cache_pool.c:398
sess = (struct sess *) 0x7f04ff32b008
__func__ = "wrk_do_cnt_sess"
#4 0x000000000041c21b in wrk_thread (priv=0x7f054bb0c0b0) at
cache_pool.c:310
ww = {magic = 1670491599, nobjhead = 0x0, nobj = 0x0, lastused =
1244124456.0244427, cond = {__data = {__lock = 0, __futex = 700852,
__total_seq = 350426, __wakeup_seq = 350426, __woken_seq = 350426,
__mutex = 0x7f0546904228, __nwaiters = 0, __broadcast_seq = 0},
__size =
"\000\000\000\000??\n\000?X\005\000\000\000\000\000?X\005\000\000\000\000\000?X\005\000\000\000\000\000(B\220F\005\177\000\000\000\000\000\000\000\000\000",
__align = 3010136419336192}, list = {
vtqe_next = 0x45e3bbe0, vtqe_prev = 0x7f054bb0c0c0}, wrq =
0x7f04ff32b198, wfd = 0x0, werr = 0, iov = {{iov_base = 0x7f052d0a6358,
iov_len = 8}, {iov_base = 0x43cfa8, iov_len = 1}, {
iov_base = 0x7f052d0a6361, iov_len = 3}, {iov_base = 0x43cfa8,
iov_len = 1}, {iov_base = 0x7f052d0a6365, iov_len = 2}, {iov_base =
0x43ea4b, iov_len = 2}, {iov_base = 0x7f052d0a6368, iov_len = 15}, {
iov_base = 0x43ea4b, iov_len = 2}, {iov_base = 0x7f052d0a6461,
iov_len = 38}, {iov_base = 0x43ea4b, iov_len = 2}, {iov_base =
0x7f052d0a6488, iov_len = 34}, {iov_base = 0x43ea4b, iov_len = 2}, {
iov_base = 0x7f052d0a64ab, iov_len = 44}, {iov_base = 0x43ea4b,
iov_len = 2}, {iov_base = 0x7f052d0a64d8, iov_len = 17}, {iov_base =
0x43ea4b, iov_len = 2}, {iov_base = 0x7f052d0a650e, iov_len = 59}, {
iov_base = 0x43ea4b, iov_len = 2}, {iov_base = 0x7f052d0a654a,
iov_len = 24}, {iov_base = 0x43ea4b, iov_len = 2}, {iov_base =
0x7f052d0a6563, iov_len = 17}, {iov_base = 0x43ea4b, iov_len = 2}, {
iov_base = 0x7f04ff32bf5e, iov_len = 35}, {iov_base = 0x43ea4b,
iov_len = 2}, {iov_base = 0x7f04ff32bf82, iov_len = 21}, {iov_base =
0x43ea4b, iov_len = 2}, {iov_base = 0x7f04ff32bf98, iov_len = 6}, {
iov_base = 0x43ea4b, iov_len = 2}, {iov_base = 0x43d49b, iov_len =
16}, {iov_base = 0x43ea4b, iov_len = 2}, {iov_base = 0x7f04ff32bf9f,
iov_len = 22}, {iov_base = 0x43ea4b, iov_len = 2}, {
iov_base = 0x43ea4b, iov_len = 2}, {iov_base = 0x7f051fd74000,
iov_len = 185528}, {iov_base = 0x43ea4b, iov_len = 2}, {iov_base =
0x7f052c21b000, iov_len = 80}, {iov_base = 0x43ea4b, iov_len = 2}, {
iov_base = 0x43ea4b, iov_len = 2}, {iov_base = 0x7f04ff12e31d,
iov_len = 22}, {iov_base = 0x43ea4b, iov_len = 2}, {iov_base = 0x43ea4b,
iov_len = 2}, {iov_base = 0x44e35a00, iov_len = 145}, {
iov_base = 0x7f04ff376dd8, iov_len = 33}, {iov_base = 0x43ea4b,
iov_len = 2}, {iov_base = 0x7f04fed092a0, iov_len = 21}, {iov_base =
0x43ea4b, iov_len = 2}, {iov_base = 0x7f04fed092b6, iov_len = 28}, {
iov_base = 0x43ea4b, iov_len = 2}, {iov_base = 0x7f04fed0931c,
iov_len = 22}, {iov_base = 0x43ea4b, iov_len = 2}, {iov_base = 0x43ea4b,
iov_len = 2}, {iov_base = 0x0, iov_len = 0} <repeats 13 times>},
niov = 0, liov = 0, vcl = 0x7f053f765328, srcaddr = 0x7f04fe53d2c0, wlb
= 0x44e37bb0 "\026", wlp = 0x44e38356 "", wle = 0x44e39bb0 "", wlr = 52,
sha256ctx = 0x44e3a0a0}
sha256 = {state = {0, 0, 0, 0, 0, 0, 0, 0}, count = 0, buf = '\0'
<repeats 63 times>}
__func__ = "wrk_thread"
#5 0x00007f054c4aa3f7 in start_thread () from /lib/libpthread.so.0
No symbol table info available.
#6 0x00007f054bd7fb3d in clone () from /lib/libc.so.6
No symbol table info available.
#7 0x0000000000000000 in ?? ()
No symbol table info available.
}}}
Again p is out of bounds. (The worker's private buffer itself looks
consistent, though: wlp - wlb = 0x44e38356 - 0x44e37bb0 = 0x7a6 = 1958
bytes, matching l = 1958, so it's only the shm-side pointer that's
garbage.)
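If it helps triage, a hypothetical bounds guard in front of the
end-marker store would turn this silent wedge into an immediate,
diagnosable assert; check_shmlog_ptr() and logend are my stand-ins, not
anything in the 2.0.4 tree:
{{{
#include <assert.h>

/* Hypothetical guard (not in 2.0.4) before the p[l] = SLT_ENDMARKER
 * store in WSL_Flush(); logstart/logend would be the bounds of the
 * mmap'd shmlog segment.  With p = 0x7f0549fc688c out of range and
 * l = 1958, this would fire at the culprit instead of leaving every
 * other thread queued behind vsl_mtx. */
static void
check_shmlog_ptr(const unsigned char *logstart, const unsigned char *logend,
    const unsigned char *p, unsigned l)
{
	assert(p >= logstart);
	assert(p + l + 1 <= logend);     /* payload + end marker must fit */
}
}}}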
Fascinating: note what ''always'' happens right before the failure:
{{{
Jun 4 07:08:30 statcache0 syslogd 1.5.0#1ubuntu1: restart.
Jun 4 07:08:32 statcache0 varnishd[5958]: Child (26929) not responding to
ping, killing it.
Jun 4 07:08:36 statcache0 last message repeated 2 times
Jun 4 07:08:36 statcache0 varnishd[5958]: Child (26929) died signal=3
(core dumped)
Jun 4 07:08:36 statcache0 varnishd[5958]: Child cleanup complete
Jun 4 07:08:36 statcache0 varnishd[5958]: child (2535) Started
Jun 4 07:08:36 statcache0 varnishd[5958]: Child (2535) said Closed fds: 4
5 6 9 10 12 13
Jun 4 07:08:36 statcache0 varnishd[5958]: Child (2535) said Child starts
Jun 4 07:08:36 statcache0 varnishd[5958]: Child (2535) said managed to
mmap 1073741824 bytes of 1073741824
Jun 4 07:08:36 statcache0 varnishd[5958]: Child (2535) said Ready
}}}
Nothing else from varnishd shows up in the log except the above, so
there's no spurious log flood. I'm also not doing any syslog() tricks in
C (yet), so what dependency on syslog() or log rotation is there within
varnishd that could cause this? Manually running the daily cron scripts
doesn't trigger it, but I'm trying to find a reproduction case. Clearly
something varnishy is awry, though.
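In the meantime I'm tempted to run the child with a purely hypothetical
watchdog thread like the sketch below (vsl_mtx_watchdog() is my
invention, not varnish code), just to get a scream on stderr the moment
the log mutex has been held for more than a few seconds, rather than
waiting for the manager's ping timeout:
{{{
/* Hypothetical watchdog thread, NOT varnish code: periodically try a
 * timed lock on vsl_mtx and complain if the shmlog mutex appears to
 * have been held for more than a few seconds.  Build with -lpthread. */
#define _POSIX_C_SOURCE 200112L          /* pthread_mutex_timedlock() */
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

extern pthread_mutex_t vsl_mtx;          /* the shmlog mutex */

static void *
vsl_mtx_watchdog(void *priv)
{
	struct timespec ts;

	(void)priv;
	for (;;) {
		clock_gettime(CLOCK_REALTIME, &ts);
		ts.tv_sec += 5;          /* normal hold time is microseconds */
		if (pthread_mutex_timedlock(&vsl_mtx, &ts) == ETIMEDOUT)
			fprintf(stderr, "vsl_mtx held > 5s: shmlog wedged?\n");
		else
			pthread_mutex_unlock(&vsl_mtx);
		sleep(10);
	}
	return (NULL);
}
}}}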
Thx,[[BR]]
-- [[BR]]
Ken.[[BR]]
--
Ticket URL: <http://varnish.projects.linpro.no/ticket/516>
Varnish <http://varnish.projects.linpro.no/>
The Varnish HTTP Accelerator