calling.h

      x86/mm: Use/Fix PCID to optimize user/kernel switches · 6fd166aa
      Peter Zijlstra authored
      
      We can use PCID to retain the TLBs across CR3 switches, including those
      now part of the user/kernel switch. This increases the performance of
      kernel entry/exit at the cost of more expensive/complicated TLB flushing.
      
      Now that we have two address spaces, one for the kernel and one for user
      space, we need two PCIDs per mm. We use the top PCID bit to indicate a
      user PCID (just as we use the PFN LSB for the PGD). Since we do TLB
      invalidation from kernel space, the existing code will only invalidate
      the kernel PCID; we augment that by marking the corresponding user PCID
      invalid and, upon switching back to userspace, using a flushing CR3
      write for the switch.
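
      As a rough stand-alone illustration of the scheme above (every name
      below, e.g. USER_PCID_BIT, PGD_USER_BIT and the plain-global
      user_pcid_flush_mask, is invented for this sketch; the real
      implementation is per-CPU and partly in assembly):

      #include <stdbool.h>
      #include <stdint.h>
      #include <stdio.h>

      #define USER_PCID_BIT   11   /* top bit of the 12-bit PCID field in CR3 */
      #define USER_PCID_MASK  (1ULL << USER_PCID_BIT)
      #define PGD_USER_BIT    12   /* PFN LSB: selects the user half of the PGD pair */
      #define PGD_USER_MASK   (1ULL << PGD_USER_BIT)

      /* One "user PCID is stale" bit per kernel PCID; per-CPU in the real code. */
      static unsigned long user_pcid_flush_mask;

      /* Same mm: user CR3 is kernel CR3 plus the user PGD half and the user PCID. */
      static uint64_t user_cr3_from_kernel(uint64_t kernel_cr3)
      {
              return kernel_cr3 | PGD_USER_MASK | USER_PCID_MASK;
      }

      /* Kernel-space TLB invalidation only reaches the kernel PCID; remember
       * that the matching user PCID still holds stale translations. */
      static void invalidate_user_pcid(unsigned int kernel_pcid)
      {
              user_pcid_flush_mask |= 1UL << kernel_pcid;
      }

      /* On the way back to userspace: flush only if the user PCID was marked
       * stale; otherwise the CR3 write may keep the TLB (NOFLUSH). */
      static bool need_flushing_cr3_write(unsigned int kernel_pcid)
      {
              if (user_pcid_flush_mask & (1UL << kernel_pcid)) {
                      user_pcid_flush_mask &= ~(1UL << kernel_pcid);
                      return true;
              }
              return false;
      }

      int main(void)
      {
              uint64_t kernel_cr3 = 0x4000 | 1;  /* kernel half of a PGD pair, kernel PCID 1 */

              invalidate_user_pcid(1);           /* a TLB flush happened in kernel space */
              printf("user CR3 %#llx, flushing CR3 write needed: %d\n",
                     (unsigned long long)user_cr3_from_kernel(kernel_cr3),
                     need_flushing_cr3_write(1));
              return 0;
      }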
      
      In order to access the user_pcid_flush_mask we use PER_CPU storage,
      which means the previously established SWAPGS vs CR3 ordering is now
      mandatory.
      
      Having to do this memory access does require additional registers: most
      sites have a functioning stack and can spill one (RAX); sites without a
      functional stack need to provide a second scratch register by other means.
      
      Note: PCID is generally available on Intel Sandybridge and later CPUs.
      Note: Up until this point TLB flushing was broken in this series.
      
      Based-on-code-from: Dave Hansen <dave.hansen@linux.intel.com>
      Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
      Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
      Cc: Andy Lutomirski <luto@kernel.org>
      Cc: Boris Ostrovsky <boris.ostrovsky@oracle.com>
      Cc: Borislav Petkov <bp@alien8.de>
      Cc: Brian Gerst <brgerst@gmail.com>
      Cc: Dave Hansen <dave.hansen@linux.intel.com>
      Cc: David Laight <David.Laight@aculab.com>
      Cc: Denys Vlasenko <dvlasenk@redhat.com>
      Cc: Eduardo Valentin <eduval@amazon.com>
      Cc: Greg KH <gregkh@linuxfoundation.org>
      Cc: H. Peter Anvin <hpa@zytor.com>
      Cc: Josh Poimboeuf <jpoimboe@redhat.com>
      Cc: Juergen Gross <jgross@suse.com>
      Cc: Linus Torvalds <torvalds@linux-foundation.org>
      Cc: Peter Zijlstra <peterz@infradead.org>
      Cc: Will Deacon <will.deacon@arm.com>
      Cc: aliguori@amazon.com
      Cc: daniel.gruss@iaik.tugraz.at
      Cc: hughd@google.com
      Cc: keescook@google.com
      Signed-off-by: Ingo Molnar <mingo@kernel.org>

    tracelog.c 3.15 KiB
    // SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
    /* Copyright (c) 2015-2017 Daniel Borkmann */
    /* Copyright (c) 2018 Netronome Systems, Inc. */
    
    #include <errno.h>
    #include <limits.h>
    #include <signal.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <linux/magic.h>
    #include <sys/fcntl.h>
    #include <sys/vfs.h>
    
    #include "main.h"
    
    #ifndef TRACEFS_MAGIC
    # define TRACEFS_MAGIC	0x74726163
    #endif
    
    #define _textify(x)	#x
    #define textify(x)	_textify(x)
    
    FILE *trace_pipe_fd;
    char *buff;
    
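     /* Return 0 if mnt is mounted and carries the expected filesystem magic. */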
    static int validate_tracefs_mnt(const char *mnt, unsigned long magic)
    {
    	struct statfs st_fs;
    
    	if (statfs(mnt, &st_fs) < 0)
    		return -ENOENT;
    	if ((unsigned long)st_fs.f_type != magic)
    		return -ENOENT;
    
    	return 0;
    }
    
    static bool
    find_tracefs_mnt_single(unsigned long magic, char *mnt, const char *mntpt)
    {
    	size_t src_len;
    
    	if (validate_tracefs_mnt(mntpt, magic))
    		return false;
    
    	src_len = strlen(mntpt);
    	if (src_len + 1 >= PATH_MAX) {
    		p_err("tracefs mount point name too long");
    		return false;
    	}
    
    	strcpy(mnt, mntpt);
    	return true;
    }
    
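     /*
      * Find the tracefs trace_pipe file: try the well-known mount points
      * first, then fall back to scanning /proc/mounts for a tracefs entry.
      * On success, the full path is written into mnt (PATH_MAX bytes).
      */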
    static bool find_tracefs_pipe(char *mnt)
    {
    	static const char * const known_mnts[] = {
    		"/sys/kernel/debug/tracing",
    		"/sys/kernel/tracing",
    		"/tracing",
    		"/trace",
    	};
    	const char *pipe_name = "/trace_pipe";
    	const char *fstype = "tracefs";
    	char type[100], format[32];
    	const char * const *ptr;
    	bool found = false;
    	FILE *fp;
    
    	for (ptr = known_mnts; ptr < known_mnts + ARRAY_SIZE(known_mnts); ptr++)
    		if (find_tracefs_mnt_single(TRACEFS_MAGIC, mnt, *ptr))
    			goto exit_found;
    
    	fp = fopen("/proc/mounts", "r");
    	if (!fp)
    		return false;
    
    	/* Allow room for NULL terminating byte and pipe file name */
    	snprintf(format, sizeof(format), "%%*s %%%zds %%99s %%*s %%*d %%*d\\n",
    		 PATH_MAX - strlen(pipe_name) - 1);
    	while (fscanf(fp, format, mnt, type) == 2)
    		if (strcmp(type, fstype) == 0) {
    			found = true;
    			break;
    		}
    	fclose(fp);
    
    	/* The string from fscanf() might be truncated, check mnt is valid */
    	if (!found || validate_tracefs_mnt(mnt, TRACEFS_MAGIC))
    		return false;
    
    exit_found:
    	strcat(mnt, pipe_name);
    	return true;
    }
    
    static void exit_tracelog(int signum)
    {
    	fclose(trace_pipe_fd);
    	free(buff);
    
    	if (json_output) {
    		jsonw_end_array(json_wtr);
    		jsonw_destroy(&json_wtr);
    	}
    
    	exit(0);
    }
    
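     /*
      * Locate and open trace_pipe, install signal handlers so that
      * SIGHUP/SIGINT/SIGTERM close the pipe and finish any JSON output,
      * then stream every line read from the pipe to stdout, either as
      * plain text or as JSON strings.
      */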
    int do_tracelog(int argc, char **argv)
    {
    	const struct sigaction act = {
    		.sa_handler = exit_tracelog
    	};
    	char trace_pipe[PATH_MAX];
    	bool found_trace_pipe;
    	size_t buff_len = 0;
    
    	if (json_output)
    		jsonw_start_array(json_wtr);
    
    	found_trace_pipe = find_tracefs_pipe(trace_pipe);
    	if (!found_trace_pipe) {
    		p_err("could not find trace pipe, tracefs not mounted?");
    		return -1;
    	}
    
    	trace_pipe_fd = fopen(trace_pipe, "r");
    	if (!trace_pipe_fd) {
    		p_err("could not open trace pipe: %s", strerror(errno));
    		return -1;
    	}
    
    	sigaction(SIGHUP, &act, NULL);
    	sigaction(SIGINT, &act, NULL);
    	sigaction(SIGTERM, &act, NULL);
    	while (1) {
    		ssize_t ret;
    
    		ret = getline(&buff, &buff_len, trace_pipe_fd);
    		if (ret <= 0) {
    			p_err("failed to read content from trace pipe: %s",
    			      strerror(errno));
    			break;
    		}
    		if (json_output)
    			jsonw_string(json_wtr, buff);
    		else
    			printf("%s", buff);
    	}
    
    	fclose(trace_pipe_fd);
    	free(buff);
    	return -1;
    }