Skip to content

Commit ad4cb4e

Browse files
committed
bt: finish the bt module and fix Zombie unwind
The bt module is not very user-friendly at this point: its only function is to backtrace every task on the system, which is usually too much data. This change turns it into a user-friendly module with arguments similar to the crash "bt" command, plus some additions:

* bt [PID | TASK]
* bt -c CPU
* bt -a
* bt -A    # the equivalent of crash: foreach bt
* bt -s D  # backtrace each task in a given state

Of course, not every crash option is supported, or should be supported.

With these changes, the default corelens report needs to be updated. Previously, we ran the equivalent of "foreach bt", which was a bit too verbose. Let's scale this back to "bt -a" (all on-CPU tasks) and "bt -s D" (all D-state tasks). For verbose reports, we can run "bt -a" and then "bt -A".

Signed-off-by: Stephen Brennan <stephen.s.brennan@oracle.com>
1 parent daf7623 commit ad4cb4e

File tree

1 file changed

+122
-26
lines changed

1 file changed

+122
-26
lines changed

drgn_tools/bt.py

Lines changed: 122 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,15 @@
55

66
import drgn
77
from drgn import FaultError
8+
from drgn import Object
89
from drgn import Program
910
from drgn import ProgramFlags
1011
from drgn import TypeKind
1112
from drgn.helpers.common.format import escape_ascii_string
1213
from drgn.helpers.linux.cpumask import for_each_online_cpu
1314
from drgn.helpers.linux.pid import for_each_task
1415
from drgn.helpers.linux.sched import cpu_curr
16+
from drgn.helpers.linux.sched import task_state_to_char
1517

1618
from drgn_tools.corelens import CorelensModule
1719
from drgn_tools.mm import AddrKind
@@ -242,8 +244,15 @@ def print_task_header(task: drgn.Object) -> None:
242244
cpu = task_cpu(task)
243245
taskp = task.value_()
244246
pid = task.pid.value_()
245-
comm = task.comm.string_().decode()
246-
print(f'PID: {pid:<7d} TASK: {taskp:x} CPU: {cpu} COMMAND: "{comm}"')
247+
comm = escape_ascii_string(task.comm.string_())
248+
st = task_state_to_char(task)
249+
cpu_note = ""
250+
if cpu_curr(task.prog_, cpu) == task:
251+
cpu_note = "!"
252+
print(
253+
f"PID: {pid:<7d} TASK: {taskp:x} [{st}] CPU: {cpu}{cpu_note}"
254+
f' COMMAND: "{comm}"'
255+
)
247256

248257

249258
def print_frames(
@@ -463,42 +472,129 @@ def bt_has(
463472
return frame_list
464473

465474

def print_online_bt(prog: Program, **kwargs: t.Any) -> None:
    """
    Print a backtrace for every online CPU

    Each CPU yielded by ``for_each_online_cpu()`` is handed to ``bt()``, and
    a blank line is printed after each backtrace.

    :param prog: program being debugged
    :param kwargs: passed to bt() to control backtrace format
    """
    for online_cpu in for_each_online_cpu(prog):
        bt(prog, cpu=online_cpu, **kwargs)
        # Blank line keeps consecutive backtraces visually separate.
        print()
def print_all_bt(
    prog: Program, states: t.Optional[t.Container[str]] = None, **kwargs: t.Any
) -> None:
    """
    Print a backtrace for every task, optionally filtered by task state

    :param prog: program being debugged
    :param states: when provided (and non-empty), only tasks whose state
      character (as returned by ``task_state_to_char()``, like the
      single-character codes reported by ps(1)) is contained in it are
      printed.
    :param kwargs: passed to bt() to control backtrace format
    """
    for task in for_each_task(prog):
        state_char = task_state_to_char(task)

        # An empty or missing filter selects every task.
        selected = not states or state_char in states
        if selected:
            try:
                bt(task, **kwargs)
            except (ValueError, FaultError) as err:
                # The unwind failed (ValueError is raised when unwinding
                # running tasks on live systems; FaultError is also caught).
                # Still identify the task, but report the error in place of
                # a stack trace.
                print_task_header(task)
                print(f"Unwind error: {err!s}")
            print()
494511

495512

class Bt(CorelensModule):
    """
    Print a stack trace for a task, CPU, or set of tasks
    """

    name = "bt"

    # For normal reports, output on-CPU tasks, then D-state tasks
    default_args = [["-a"], ["-s", "D"]]

    # For verbose reports, output on-CPU tasks, followed by all tasks
    verbose_args = [["-a"], ["-A"]]

    def add_args(self, parser: argparse.ArgumentParser) -> None:
        """
        Register the command-line options of the bt module

        One formatting option (``-v``) is added, plus a mutually exclusive
        group selecting what to backtrace: a PID/task address, a CPU, all
        on-CPU tasks, tasks in given states, or all tasks.

        :param parser: argument parser to add the options to
        """
        fmt = parser.add_argument_group(
            description="Options controlling the format of the backtrace",
        )
        fmt.add_argument(
            "--variables",
            "-v",
            action="store_true",
            help="show variable values, where possible",
        )
        # Exactly one selector may be given; run() reports an error when
        # none is present.
        op = parser.add_mutually_exclusive_group()
        op.add_argument(
            "pid_or_task",
            type=str,
            nargs="?",
            help="A PID, or address of a task_struct to unwind",
        )
        op.add_argument(
            "-c",
            "--cpu",
            type=int,
            help="Print a backtrace for CPU",
        )
        op.add_argument(
            "-a",
            "--all-on-cpu",
            action="store_true",
            help=(
                "Print all tasks currently executing on CPU. Not supported "
                "for live systems: ignored in that case."
            ),
        )
        op.add_argument(
            "--state",
            "-s",
            action="append",
            help=("Print all tasks on the system which are in STATE."),
        )
        op.add_argument(
            "--all-tasks",
            "-A",
            action="store_true",
            help="Print all tasks on the system",
        )

    @staticmethod
    def _task_from_arg(prog: Program, spec: str) -> Object:
        """
        Turn the PID-or-TASK command-line argument into a task_struct pointer

        :param prog: program being debugged
        :param spec: a decimal PID, or the hexadecimal address of a
          ``struct task_struct``
        :returns: ``struct task_struct *`` object for the selected task
        :raises LookupError: if ``spec`` is a decimal PID that
          ``prog.thread()`` cannot find
        :raises ValueError: if ``spec`` is neither decimal nor hexadecimal
        """
        try:
            pid = int(spec, 10)
        except ValueError:
            # Not a decimal number, so it must be a task_struct address.
            task_ptr = int(spec, 16)
            return Object(prog, "struct task_struct *", value=task_ptr)
        # Keep the lookup outside the try block: a failure here must not be
        # mistaken for "the argument is not decimal".
        return prog.thread(pid).object

    def run(self, prog: Program, args: argparse.Namespace) -> None:
        """
        Print the backtrace(s) selected by the parsed arguments

        :param prog: program being debugged
        :param args: namespace produced by the parser set up in add_args()
        """
        kwargs = {
            "show_vars": args.variables,
        }
        if args.cpu is not None:
            if prog.flags & ProgramFlags.IS_LIVE:
                print("On-CPU tasks are not supported for /proc/kcore")
            else:
                bt(prog, cpu=args.cpu, **kwargs)
        elif args.pid_or_task is not None:
            try:
                task = self._task_from_arg(prog, args.pid_or_task)
            except LookupError:
                print(f"error: no task found with PID {args.pid_or_task}")
            except ValueError:
                print(
                    f"error: {args.pid_or_task!r} is neither a PID nor a "
                    "task_struct address"
                )
            else:
                bt(task, **kwargs)
        elif args.all_on_cpu:
            if prog.flags & ProgramFlags.IS_LIVE:
                print("On-CPU tasks not supported for /proc/kcore")
            else:
                print("On-CPU tasks:")
                print_online_bt(prog, **kwargs)
        elif args.state:
            print(f"Tasks in states {', '.join(args.state)}: ")
            # Forward the format options so -v is honoured here as well.
            print_all_bt(prog, states=args.state, **kwargs)
        elif args.all_tasks:
            print("All tasks:")
            print_all_bt(prog, **kwargs)
        else:
            print("error: select a task or group of tasks to backtrace")

0 commit comments

Comments
 (0)