]>
Commit | Line | Data |
---|---|---|
59d2274f DW |
1 | #!/usr/bin/env python3 |
2 | ||
959ef981 | 3 | # SPDX-License-Identifier: GPL-2.0+ |
59d2274f DW |
4 | # Copyright (C) 2016 Oracle. All Rights Reserved. |
5 | # | |
6 | # Author: Darrick J. Wong <darrick.wong@oracle.com> | |
959ef981 DC |
7 | |
8 | # Read ftrace input, looking for XFS buffer deadlocks. | |
59d2274f DW |
9 | # |
10 | # Rough guide to using this script: | |
11 | # Collect ftrace data from a deadlock: | |
12 | # | |
13 | # # trace-cmd record -e 'xfs_buf_*lock*' <other traces> & | |
14 | # <run command, hang system>^Z | |
15 | # # killall -INT trace-cmd | |
16 | # <wait for trace-cmd to spit out trace.dat> | |
17 | # | |
18 | # Now analyze the captured trace data: | |
19 | # | |
20 | # # trace-cmd report | xfsbuflock.py | |
21 | # === fsx-14956 === | |
22 | # <trace data> | |
23 | # 3732.005575: xfs_buf_trylock_fail: dev 8:16 bno 0x1 nblks 0x1 hold 4 \ | |
24 | # pincount 1 lock 0 flags DONE|KMEM caller 0xc009af36s | |
25 | # Locked buffers: | |
26 | # dev 8:16 bno 0x64c371 nblks 0x1 lock 1 owner fsx-14956@3732.005567 | |
27 | # waiting: fsx-14954 | |
28 | # dev 8:16 bno 0x64c380 nblks 0x8 lock 1 owner fsx-14956@3732.005571 | |
29 | # dev 8:16 bno 0x64c378 nblks 0x8 lock 1 owner fsx-14956@3732.005570 | |
30 | # === fsx-14954 === | |
31 | # <trace data> | |
32 | # 3732.005592: xfs_buf_trylock_fail: dev 8:16 bno 0x64c371 nblks 0x1 hold 4 \ | |
33 | # pincount 1 lock 0 flags ASYNC|DONE|KMEM caller 0xc009af36s | |
34 | # Locked buffers: | |
35 | # dev 8:16 bno 0x8 nblks 0x8 lock 1 owner fsx-14954@3732.005583 | |
36 | # dev 8:16 bno 0x1 nblks 0x1 lock 1 owner fsx-14954@3732.005574 | |
37 | # waiting: fsx-14956 | |
38 | # waiting: fsx-14957 | |
39 | # waiting: fsx-14958 | |
40 | # dev 8:16 bno 0x10 nblks 0x8 lock 1 owner fsx-14954@3732.005585 | |
41 | # | |
42 | # As you can see, fsx-14596 is locking AGFs in violation of the locking | |
43 | # order rules. | |
44 | ||
45 | import sys | |
46 | import fileinput | |
47 | from collections import namedtuple | |
48 | ||
49 | NR_BACKTRACE = 50 | |
50 | ||
51 | class Process: | |
52 | def __init__(self, pid): | |
53 | self.pid = pid; | |
54 | self.bufs = set() | |
55 | self.locked_bufs = set() | |
56 | self.backtrace = [] | |
57 | ||
58 | def dump(self): | |
59 | print('=== %s ===' % self.pid) | |
60 | for bt in self.backtrace: | |
61 | print('%f: %s' % (bt.time, bt.descr)) | |
62 | print('Locked buffers:') | |
63 | for buf in self.locked_bufs: | |
64 | buf.dump() | |
65 | ||
66 | class Buffer: | |
67 | def __init__(self, dev, bno, blen): | |
68 | self.dev = dev | |
69 | self.bno = int(bno, 0) | |
70 | self.blen = int(blen, 0) | |
71 | self.locked = False | |
72 | self.locktime = None | |
73 | self.owner = None | |
74 | self.waiters = set() | |
ece9fa6e | 75 | self.lockline = 0 |
59d2274f DW |
76 | |
77 | def trylock(self, process, time): | |
c2b571f1 DW |
78 | if not self.locked: |
79 | self.lockdone(process, time) | |
59d2274f | 80 | |
184798fd DW |
81 | def init(self, process, time): |
82 | # Buffers are initialized locked, but we could be allocating | |
83 | # a surplus buffer while trying to grab a buffer that may or | |
84 | # may not already exist. | |
85 | if not self.locked: | |
86 | self.lockdone(process, time) | |
87 | ||
59d2274f DW |
88 | def lockdone(self, process, time): |
89 | if self.locked: | |
ece9fa6e DW |
90 | print('Buffer 0x%x already locked at line %d? (line %d)' % \ |
91 | (self.bno, self.lockline, nr)) | |
59d2274f DW |
92 | # process.dump() |
93 | # self.dump() | |
94 | # assert False | |
95 | if process in self.waiters: | |
96 | self.waiters.remove(process) | |
97 | self.locked = True | |
98 | self.owner = process | |
99 | self.locktime = time | |
ece9fa6e | 100 | self.lockline = nr |
59d2274f DW |
101 | process.locked_bufs.add(self) |
102 | process.bufs.add(self) | |
103 | locked_buffers.add(self) | |
104 | ||
105 | def waitlock(self, process): | |
106 | self.waiters.add(process) | |
107 | ||
108 | def unlock(self): | |
109 | self.locked = False | |
110 | if self in locked_buffers: | |
111 | locked_buffers.remove(self) | |
112 | if self.owner is not None and \ | |
113 | self in self.owner.locked_bufs: | |
114 | self.owner.locked_bufs.remove(self) | |
115 | ||
116 | def dump(self): | |
117 | if self.owner is not None: | |
ece9fa6e DW |
118 | pid = '%s@%f (line %d)' % \ |
119 | (self.owner.pid, self.locktime, self.lockline) | |
59d2274f DW |
120 | else: |
121 | pid = '' | |
122 | print('dev %s bno 0x%x nblks 0x%x lock %d owner %s' % \ | |
123 | (self.dev, self.bno, self.blen, self.locked, \ | |
124 | pid)) | |
125 | for proc in self.waiters: | |
126 | print(' waiting: %s' % proc.pid) | |
127 | ||
128 | Event = namedtuple('Event', 'time descr') | |
129 | ||
130 | # Read ftrace input, looking for events and for buffer lock info | |
131 | processes = {} | |
132 | buffers = {} | |
133 | locked_buffers = set() | |
134 | ||
135 | def getbuf(toks): | |
136 | if int(toks[7], 0) == 18446744073709551615: | |
137 | return None | |
138 | bufkey = ' '.join(toks[4:10]) | |
139 | if bufkey in buffers: | |
140 | return buffers[bufkey] | |
141 | buf = Buffer(toks[5], toks[7], toks[9]) | |
142 | buffers[bufkey] = buf | |
143 | return buf | |
144 | ||
145 | nr = 0 | |
146 | for line in fileinput.input(): | |
147 | nr += 1 | |
148 | toks = line.split() | |
149 | if len(toks) < 4: | |
150 | continue | |
151 | pid = toks[0] | |
f231685e DW |
152 | try: |
153 | time = float(toks[2][:-1]) | |
154 | except: | |
155 | continue | |
59d2274f DW |
156 | fn = toks[3][:-1] |
157 | ||
158 | if pid in processes: | |
159 | proc = processes[pid] | |
160 | else: | |
161 | proc = Process(pid) | |
162 | processes[pid] = proc | |
163 | ||
164 | if fn == 'xfs_buf_unlock' or fn == 'xfs_buf_item_unlock_stale': | |
165 | buf = getbuf(toks) | |
166 | if buf is not None: | |
167 | buf.unlock() | |
168 | elif fn == 'xfs_buf_lock_done': | |
169 | buf = getbuf(toks) | |
170 | if buf is not None: | |
171 | buf.lockdone(proc, time) | |
172 | elif fn == 'xfs_buf_lock': | |
173 | buf = getbuf(toks) | |
174 | if buf is not None: | |
175 | buf.waitlock(proc) | |
176 | elif fn == 'xfs_buf_trylock': | |
177 | buf = getbuf(toks) | |
178 | if buf is not None: | |
179 | buf.trylock(proc, time) | |
184798fd DW |
180 | elif fn == 'xfs_buf_init': |
181 | buf = getbuf(toks) | |
182 | if buf is not None: | |
183 | buf.init(proc, time) | |
59d2274f DW |
184 | elif fn == 'xfs_buf_item_unlock': |
185 | pass | |
186 | else: | |
187 | e = Event(time, ' '.join(toks[3:])) | |
188 | proc.backtrace.append(e) | |
189 | if len(proc.backtrace) > NR_BACKTRACE: | |
190 | proc.backtrace.pop(0) | |
191 | ||
192 | deadlocked = set() | |
193 | for buf in locked_buffers: | |
194 | deadlocked.add(buf.owner) | |
195 | ||
196 | for proc in deadlocked: | |
197 | proc.dump() | |
198 | ||
199 | sys.exit(0) | |
200 | ||
201 | for key in buffers: | |
202 | buf = buffers[key] | |
203 | if buf.locked: | |
204 | print('dev %s bno 0x%x len 0x%x owner %s' % (buf.dev, buf.bno, buf.blen, buf.owner.pid)) | |
205 | else: | |
206 | print('dev %s bno 0x%x len 0x%x' % (buf.dev, buf.bno, buf.blen)) | |
207 | ||
208 | sys.exit(0) | |
209 | ||
210 | for pid in processes: | |
211 | proc = processes[pid] |