Skip to content
  • Arnaldo Carvalho de Melo's avatar
    perf annotate: Use absolute addresses to calculate jump target offsets · 980b68ec
    Arnaldo Carvalho de Melo authored
    These types of jumps were confusing the annotate browser:
    
    entry_SYSCALL_64  /lib/modules/4.16.0-rc5-00086-gdf09348f/build/vmlinux
    
    entry_SYSCALL_64  /lib/modules/4.16.0-rc5-00086-gdf09348f/build/vmlinux
      Percent│ffffffff81a00020:   swapgs
      <SNIP>
             │ffffffff81a00128: ↓ jae    ffffffff81a00139 <syscall_return_via_sysret+0x53>
      <SNIP>
             │ffffffff81a00155: → jmpq   *0x825d2d(%rip)   # ffffffff82225e88 <pv_cpu_ops+0xe8>
    
    I.e. the syscall_return_via_sysret function is actually "inside" the
    entry_SYSCALL_64 function, and the offsets in jumps like these (+0x53)
    are relative to syscall_return_via_sysret, not to entry_SYSCALL_64.
    
    Or this may be some artifact in how the assembler marks the start and
    end of a function and how this ends up in the ELF symtab for vmlinux,
    i.e. syscall_return_via_sysret() isn't "inside" entry_SYSCALL_64, but
    just right after it.
    
    From readelf -sw vmlinux:
    
     80267: ffffffff81a00020   315 NOTYPE  GLOBAL DEFAULT    1 entry_SYSCALL_64
       316: ffffffff81a000e6     0 NOTYPE  LOCAL  DEFAULT    1 syscall_return_via_sysret
    
     0xffffffff81a00020 + 315 > 0xffffffff81a000e6
    
    So instead of looking for offsets after that last '+' sign, calculate
    offsets for jump target addresses that are inside the function being
    disassembled from the absolute address, 0xffffffff81a00139 in this case,
    subtracting from it the objdump address for the start of the function
    being disassembled, entry_SYSCALL_64() in this case.
    
    So, before this patch:
    
    entry_SYSCALL_64  /lib/modules/4.16.0-rc5-00086-gdf09348f/build/vmlinux
    Percent│       pop    %r10
           │       pop    %r9
           │       pop    %r8
           │       pop    %rax
           │       pop    %rsi
           │       pop    %rdx
           │       pop    %rsi
           │       mov    %rsp,%rdi
           │       mov    %gs:0x5004,%rsp
           │       pushq  0x28(%rdi)
           │       pushq  (%rdi)
           │       push   %rax
           │     ↑ jmp    6c
           │       mov    %cr3,%rdi
           │     ↑ jmp    62
           │       mov    %rdi,%rax
           │       and    $0x7ff,%rdi
           │       bt     %rdi,%gs:0x2219a
           │     ↑ jae    53
           │       btr    %rdi,%gs:0x2219a
           │       mov    %rax,%rdi
           │     ↑ jmp    5b
    
    After:
    
    entry_SYSCALL_64  /lib/modules/4.16.0-rc5-00086-gdf09348f/build/vmlinux
      0.65 │     → jne    swapgs_restore_regs_and_return_to_usermode
           │       pop    %r10
           │       pop    %r9
           │       pop    %r8
           │       pop    %rax
           │       pop    %rsi
           │       pop    %rdx
           │       pop    %rsi
           │       mov    %rsp,%rdi
           │       mov    %gs:0x5004,%rsp
           │       pushq  0x28(%rdi)
           │       pushq  (%rdi)
           │       push   %rax
           │     ↓ jmp    132
           │       mov    %cr3,%rdi
           │    ┌──jmp    128
           │    │  mov    %rdi,%rax
           │    │  and    $0x7ff,%rdi
           │    │  bt     %rdi,%gs:0x2219a
           │    │↓ jae    119
           │    │  btr    %rdi,%gs:0x2219a
           │    │  mov    %rax,%rdi
           │    │↓ jmp    121
           │119:│  mov    %rax,%rdi
           │    │  bts    $0x3f,%rdi
           │121:│  or     $0x800,%rdi
           │128:└─→or     $0x1000,%rdi
           │       mov    %rdi,%cr3
           │132:   pop    %rax
           │       pop    %rdi
           │       pop    %rsp
           │     → jmpq   *0x825d2d(%rip)        # ffffffff82225e88 <pv_cpu_ops+0xe8>
    
    With those at least navigating to the right destination, an improvement
    for these cases seems to be to somehow mark those inner functions,
    which in this case could be:
    
    entry_SYSCALL_64  /lib/modules/4.16.0-rc5-00086-gdf09348f/build/vmlinux
           │syscall_return_via_sysret:
           │       pop    %r15
           │       pop    %r14
           │       pop    %r13
           │       pop    %r12
           │       pop    %rbp
           │       pop    %rbx
           │       pop    %rsi
           │       pop    %r10
           │       pop    %r9
           │       pop    %r8
           │       pop    %rax
           │       pop    %rsi
           │       pop    %rdx
           │       pop    %rsi
           │       mov    %rsp,%rdi
           │       mov    %gs:0x5004,%rsp
           │       pushq  0x28(%rdi)
           │       pushq  (%rdi)
           │       push   %rax
           │     ↓ jmp    132
           │       mov    %cr3,%rdi
           │    ┌──jmp    128
           │    │  mov    %rdi,%rax
           │    │  and    $0x7ff,%rdi
           │    │  bt     %rdi,%gs:0x2219a
           │    │↓ jae    119
           │    │  btr    %rdi,%gs:0x2219a
           │    │  mov    %rax,%rdi
           │    │↓ jmp    121
           │119:│  mov    %rax,%rdi
           │    │  bts    $0x3f,%rdi
           │121:│  or     $0x800,%rdi
           │128:└─→or     $0x1000,%rdi
           │       mov    %rdi,%cr3
           │132:   pop    %rax
           │       pop    %rdi
           │       pop    %rsp
           │     → jmpq   *0x825d2d(%rip)        # ffffffff82225e88 <pv_cpu_ops+0xe8>
    
    This all gets much better viewed if one uses 'perf report --ignore-vmlinux'
    forcing the usage of /proc/kcore + /proc/kallsyms, when the above
    actually gets down to:
    
      # perf report --ignore-vmlinux
      ## do '/64', will show the function names containing '64',
      ## navigate to /entry_SYSCALL_64_after_hwframe.annotation,
      ## press 'A' to annotate, then 'P' to print that annotation
      ## to a file
      ## From another xterm (or see on screen, this 'P' thing is for
      ## getting rid of those right side scroll bars/spaces):
      # cat /entry_SYSCALL_64_after_hwframe.annotation
      entry_SYSCALL_64_after_hwframe() /proc/kcore
      Event: cycles:ppp
    
      Percent
                  Disassembly of section load0:
    
                  ffffffff9aa00044 <load0>:
       11.97        push   %rax
        4.85        push   %rdi
                    push   %rsi
        2.59        push   %rdx
        2.27        push   %rcx
        0.32        pushq  $0xffffffffffffffda
        1.29        push   %r8
                    xor    %r8d,%r8d
        1.62        push   %r9
        0.65        xor    %r9d,%r9d
        1.62        push   %r10
                    xor    %r10d,%r10d
        5.50        push   %r11
                    xor    %r11d,%r11d
        3.56        push   %rbx
                    xor    %ebx,%ebx
        4.21        push   %rbp
                    xor    %ebp,%ebp
        2.59        push   %r12
        0.97        xor    %r12d,%r12d
        3.24        push   %r13
                    xor    %r13d,%r13d
        2.27        push   %r14
                    xor    %r14d,%r14d
        4.21        push   %r15
                    xor    %r15d,%r15d
        0.97        mov    %rsp,%rdi
        5.50      → callq  do_syscall_64
       14.56        mov    0x58(%rsp),%rcx
        7.44        mov    0x80(%rsp),%r11
        0.32        cmp    %rcx,%r11
                  → jne    swapgs_restore_regs_and_return_to_usermode
        0.32        shl    $0x10,%rcx
        0.32        sar    $0x10,%rcx
        3.24        cmp    %rcx,%r11
                  → jne    swapgs_restore_regs_and_return_to_usermode
        2.27        cmpq   $0x33,0x88(%rsp)
        1.29      → jne    swapgs_restore_regs_and_return_to_usermode
                    mov    0x30(%rsp),%r11
        8.74        cmp    %r11,0x90(%rsp)
                  → jne    swapgs_restore_regs_and_return_to_usermode
        0.32        test   $0x10100,%r11
                  → jne    swapgs_restore_regs_and_return_to_usermode
        0.32        cmpq   $0x2b,0xa0(%rsp)
        0.65      → jne    swapgs_restore_regs_and_return_to_usermode
    
    I.e. using kallsyms makes the function start/end be determined differently
    than using what is in the vmlinux ELF symtab, and the hits actually
    go to entry_SYSCALL_64_after_hwframe, which is a GLOBAL() after the
    start of entry_SYSCALL_64:
    
      ENTRY(entry_SYSCALL_64)
              UNWIND_HINT_EMPTY
      <SNIP>
              pushq   $__USER_CS                      /* pt_regs->cs */
              pushq   %rcx                            /* pt_regs->ip */
      GLOBAL(entry_SYSCALL_64_after_hwframe)
              pushq   %rax                            /* pt_regs->orig_ax */
    
              PUSH_AND_CLEAR_REGS rax=$-ENOSYS
    
    And it goes and ends at:
    
              cmpq    $__USER_DS, SS(%rsp)            /* SS must match SYSRET */
              jne     swapgs_restore_regs_and_return_to_usermode
    
              /*
               * We win! This label is here just for ease of understanding
               * perf profiles. Nothing jumps here.
               */
      syscall_return_via_sysret:
              /* rcx and r11 are already restored (see code above) */
              UNWIND_HINT_EMPTY
              POP_REGS pop_rdi=0 skip_r11rcx=1
    
    So perhaps some people should really just play with '--ignore-vmlinux'
    to force /proc/kcore + kallsyms.
    
    One idea is to do both, i.e. have a vmlinux annotation and a
    kcore+kallsyms one, when possible, and even show the patched location,
    etc.
    
    Reported-by: Linus Torvalds <torvalds@linux-foundation.org>
    Cc: Adrian Hunter <adrian.hunter@intel.com>
    Cc: Andi Kleen <ak@linux.intel.com>
    Cc: David Ahern <dsahern@gmail.com>
    Cc: Jin Yao <yao.jin@linux.intel.com>
    Cc: Jiri Olsa <jolsa@kernel.org>
    Cc: Namhyung Kim <namhyung@kernel.org>
    Cc: Wang Nan <wangnan0@huawei.com>
    Link: https://lkml.kernel.org/n/tip-r11knxv8voesav31xokjiuo6@git.kernel.org
    
    
    Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
    980b68ec