]> git.ipfire.org Git - thirdparty/linux.git/blob
53c233370fae
[thirdparty/linux.git] /
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Augment the raw_syscalls tracepoints with the contents of the pointer arguments.
4 *
5 * Test it with:
6 *
7 * perf trace -e tools/perf/examples/bpf/augmented_raw_syscalls.c cat /etc/passwd > /dev/null
8 *
9 * This exactly matches what is marshalled into the raw_syscall:sys_enter
10 * payload expected by the 'perf trace' beautifiers.
11 *
12 * For now it just uses the existing tracepoint augmentation code in 'perf
13 * trace', in the next csets we'll hook up these with the sys_enter/sys_exit
14  * code that will combine entry/exit in a strace-like way.
15 */
16
17 #include <unistd.h>
18 #include <pid_filter.h>
19
/*
 * bpf-output associated map: per-CPU perf ring buffer that
 * perf_event_output() pushes the augmented payloads through to the
 * 'perf trace' userspace consumer.
 */
struct bpf_map SEC("maps") __augmented_syscalls__ = {
	.type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(u32),
	.max_entries = __NR_CPUS__, /* one ring per possible CPU */
};
27
/* Per-syscall state, one entry per slot in the 'syscalls' map below. */
struct syscall {
	bool enabled; /* set from userspace for each syscall 'perf trace' cares about */
};
31
/*
 * Filter map keyed by syscall number: userspace flips 'enabled' on the
 * entries it wants events for, everything else is dropped in-kernel.
 */
struct bpf_map SEC("maps") syscalls = {
	.type = BPF_MAP_TYPE_ARRAY,
	.key_size = sizeof(int),
	.value_size = sizeof(struct syscall),
	.max_entries = 512, /* covers the syscall number space used here */
};
38
/*
 * Layout of the raw_syscalls:sys_enter tracepoint payload.
 * NOTE(review): must stay in sync with the tracepoint's format file —
 * confirm against /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/format.
 */
struct syscall_enter_args {
	unsigned long long common_tp_fields;
	long		   syscall_nr;
	unsigned long	   args[6];
};
44
/* Layout of the raw_syscalls:sys_exit tracepoint payload. */
struct syscall_exit_args {
	unsigned long long common_tp_fields;
	long		   syscall_nr;
	long		   ret; /* syscall return value */
};
50
/*
 * Trailing blob appended to the sys_enter payload carrying the string
 * pointed to by a filename argument, in the shape the 'perf trace'
 * beautifiers expect: size + pad + bytes.
 */
struct augmented_filename {
	unsigned int	size;	  /* number of bytes copied into value[] */
	int		reserved; /* padding, always zeroed before output */
	char		value[256];
};
56
/*
 * Syscalls whose pointer args we augment with the pointed-to filename.
 * NOTE(review): these numbers match the x86_64 syscall table — other
 * architectures use different numbers; confirm how multi-arch is handled.
 */
#define SYS_OPEN 2
#define SYS_ACCESS 21
#define SYS_OPENAT 257
60
/*
 * Map of pids to ignore (perf's own), so tracing doesn't feed back on
 * itself. NOTE(review): exact map shape comes from pid_filter.h — confirm.
 */
pid_filter(pids_filtered);
62
/*
 * Fires on every syscall entry: copies the tracepoint payload, appends the
 * filename string for open/access/openat, and ships the combined record to
 * userspace via the __augmented_syscalls__ perf ring. Returns 0 always
 * (the event is delivered through perf_event_output, not the return value).
 */
SEC("raw_syscalls:sys_enter")
int sys_enter(struct syscall_enter_args *args)
{
	struct {
		struct syscall_enter_args args;
		struct augmented_filename filename;
	} augmented_args;
	struct syscall *syscall;
	unsigned int len = sizeof(augmented_args);
	const void *filename_arg = NULL;

	/* Don't trace perf's own pids, that would loop back on itself. */
	if (pid_filter__has(&pids_filtered, getpid()))
		return 0;

	/* Copy the ctx payload to the stack before looking at its fields. */
	probe_read(&augmented_args.args, sizeof(augmented_args.args), args);

	/* Only emit events for syscalls userspace enabled in the map. */
	syscall = bpf_map_lookup_elem(&syscalls, &augmented_args.args.syscall_nr);
	if (syscall == NULL || !syscall->enabled)
		return 0;
	/*
	 * Yonghong and Edward Cree sayz:
	 *
	 * https://www.spinics.net/lists/netdev/msg531645.html
	 *
	 * >> R0=inv(id=0) R1=inv2 R6=ctx(id=0,off=0,imm=0) R7=inv64 R10=fp0,call_-1
	 * >> 10: (bf) r1 = r6
	 * >> 11: (07) r1 += 16
	 * >> 12: (05) goto pc+2
	 * >> 15: (79) r3 = *(u64 *)(r1 +0)
	 * >> dereference of modified ctx ptr R1 off=16 disallowed
	 * > Aha, we at least got a different error message this time.
	 * > And indeed llvm has done that optimisation, rather than the more obvious
	 * > 11: r3 = *(u64 *)(r1 +16)
	 * > because it wants to have lots of reads share a single insn. You may be able
	 * > to defeat that optimisation by adding compiler barriers, idk. Maybe someone
	 * > with llvm knowledge can figure out how to stop it (ideally, llvm would know
	 * > when it's generating for bpf backend and not do that). -O0? ¯\_(ツ)_/¯
	 *
	 * The optimization mostly likes below:
	 *
	 * br1:
	 * ...
	 * r1 += 16
	 * goto merge
	 * br2:
	 * ...
	 * r1 += 20
	 * goto merge
	 * merge:
	 * *(u64 *)(r1 + 0)
	 *
	 * The compiler tries to merge common loads. There is no easy way to
	 * stop this compiler optimization without turning off a lot of other
	 * optimizations. The easiest way is to add barriers:
	 *
	 * __asm__ __volatile__("": : :"memory")
	 *
	 * after the ctx memory access to prevent their down stream merging.
	 */
	switch (augmented_args.args.syscall_nr) {
	case SYS_ACCESS:
	case SYS_OPEN: filename_arg = (const void *)args->args[0];
			/* barrier: see the verifier discussion above */
			__asm__ __volatile__("": : :"memory");
			break;
	case SYS_OPENAT: filename_arg = (const void *)args->args[1];
			break;
	}

	if (filename_arg != NULL) {
		augmented_args.filename.reserved = 0;
		augmented_args.filename.size = probe_read_str(&augmented_args.filename.value,
						      sizeof(augmented_args.filename.value),
						      filename_arg);
		if (augmented_args.filename.size < sizeof(augmented_args.filename.value)) {
			/* Shrink the payload: don't ship the unused tail of value[]. */
			len -= sizeof(augmented_args.filename.value) - augmented_args.filename.size;
			/*
			 * Mask to convince the verifier len is bounded.
			 * NOTE(review): sizeof(augmented_args) exceeds 256, so this
			 * mask can wrap len for long filenames — confirm intended.
			 */
			len &= sizeof(augmented_args.filename.value) - 1;
		}
	} else {
		/* No filename to append: ship just the raw tracepoint payload. */
		len = sizeof(augmented_args.args);
	}

	perf_event_output(args, &__augmented_syscalls__, BPF_F_CURRENT_CPU, &augmented_args, len);
	return 0;
}
147
148 SEC("raw_syscalls:sys_exit")
149 int sys_exit(struct syscall_exit_args *args)
150 {
151 struct syscall_exit_args exit_args;
152 struct syscall *syscall;
153
154 if (pid_filter__has(&pids_filtered, getpid()))
155 return 0;
156
157 probe_read(&exit_args, sizeof(exit_args), args);
158
159 syscall = bpf_map_lookup_elem(&syscalls, &exit_args.syscall_nr);
160 if (syscall == NULL || !syscall->enabled)
161 return 0;
162
163 return 1;
164 }
165
/* Required: the kernel only lets GPL-compatible programs use some helpers. */
license(GPL);