diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 6a1146ea4d4d885dfa197e4d6aea6562c5d05c7e..4e3d5a9621fe0052fac43d5ad6c109b9d3f54447 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -223,27 +223,48 @@ static unsigned long
 __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
 {
 	struct kprobe *kp;
+	unsigned long faddr;
 
 	kp = get_kprobe((void *)addr);
-	/* There is no probe, return original address */
-	if (!kp)
+	faddr = ftrace_location(addr);
+	/*
+	 * Addresses inside the ftrace location are refused by
+	 * arch_check_ftrace_location(). Something went terribly wrong
+	 * if such an address is checked here.
+	 */
+	if (WARN_ON(faddr && faddr != addr))
+		return 0UL;
+	/*
+	 * Use the current code if it is not modified by Kprobe
+	 * and it cannot be modified by ftrace.
+	 */
+	if (!kp && !faddr)
 		return addr;
 
 	/*
-	 *  Basically, kp->ainsn.insn has an original instruction.
-	 *  However, RIP-relative instruction can not do single-stepping
-	 *  at different place, __copy_instruction() tweaks the displacement of
-	 *  that instruction. In that case, we can't recover the instruction
-	 *  from the kp->ainsn.insn.
+	 * Basically, kp->ainsn.insn has an original instruction.
+	 * However, RIP-relative instruction can not do single-stepping
+	 * at different place, __copy_instruction() tweaks the displacement of
+	 * that instruction. In that case, we can't recover the instruction
+	 * from the kp->ainsn.insn.
 	 *
-	 *  On the other hand, kp->opcode has a copy of the first byte of
-	 *  the probed instruction, which is overwritten by int3. And
-	 *  the instruction at kp->addr is not modified by kprobes except
-	 *  for the first byte, we can recover the original instruction
-	 *  from it and kp->opcode.
+	 * On the other hand, in case on normal Kprobe, kp->opcode has a copy
+	 * of the first byte of the probed instruction, which is overwritten
+	 * by int3. And the instruction at kp->addr is not modified by kprobes
+	 * except for the first byte, we can recover the original instruction
+	 * from it and kp->opcode.
+	 *
+	 * In case of Kprobes using ftrace, we do not have a copy of
+	 * the original instruction. In fact, the ftrace location might
+	 * be modified at anytime and even could be in an inconsistent state.
+	 * Fortunately, we know that the original code is the ideal 5-byte
+	 * long NOP.
 	 */
-	memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
-	buf[0] = kp->opcode;
+	memcpy(buf, (void *)addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
+	if (faddr)
+		memcpy(buf, ideal_nops[NOP_ATOMIC5], 5);
+	else
+		buf[0] = kp->opcode;
 	return (unsigned long)buf;
 }
 
@@ -251,6 +272,7 @@ __recover_probed_insn(kprobe_opcode_t *buf, unsigned long addr)
  * Recover the probed instruction at addr for further analysis.
  * Caller must lock kprobes by kprobe_mutex, or disable preemption
  * for preventing to release referencing kprobes.
+ * Returns zero if the instruction can not get recovered.
  */
 unsigned long recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
 {
@@ -285,6 +307,8 @@ static int can_probe(unsigned long paddr)
 		 * normally used, we just go through if there is no kprobe.
 		 */
 		__addr = recover_probed_instruction(buf, addr);
+		if (!__addr)
+			return 0;
 		kernel_insn_init(&insn, (void *)__addr, MAX_INSN_SIZE);
 		insn_get_length(&insn);
 
@@ -333,6 +357,8 @@ int __copy_instruction(u8 *dest, u8 *src)
 	unsigned long recovered_insn =
 		recover_probed_instruction(buf, (unsigned long)src);
 
+	if (!recovered_insn)
+		return 0;
 	kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 	insn_get_length(&insn);
 	/* Another subsystem puts a breakpoint, failed to recover */
diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c
index 0dd8d089c315e0e9df338e9144799c6e900df831..7b3b9d15c47a63953d6932026cc57db795e3a507 100644
--- a/arch/x86/kernel/kprobes/opt.c
+++ b/arch/x86/kernel/kprobes/opt.c
@@ -259,6 +259,8 @@ static int can_optimize(unsigned long paddr)
 			 */
 			return 0;
 		recovered_insn = recover_probed_instruction(buf, addr);
+		if (!recovered_insn)
+			return 0;
 		kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE);
 		insn_get_length(&insn);
 		/* Another subsystem puts a breakpoint */
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index 6c14afe8c1b18ea1ab4a50aa4b6dc46925d304a1..db1d3a29d97fec67c47a4dd84eca2f7779ec1865 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -289,7 +289,7 @@ static u64 do_memcpy_cycle(const struct routine *r, size_t len, bool prefault)
 	memcpy_t fn = r->fn.memcpy;
 	int i;
 
-	memcpy_alloc_mem(&src, &dst, len);
+	memcpy_alloc_mem(&dst, &src, len);
 
 	if (prefault)
 		fn(dst, src, len);
@@ -312,7 +312,7 @@ static double do_memcpy_gettimeofday(const struct routine *r, size_t len,
 	void *src = NULL, *dst = NULL;
 	int i;
 
-	memcpy_alloc_mem(&src, &dst, len);
+	memcpy_alloc_mem(&dst, &src, len);
 
 	if (prefault)
 		fn(dst, src, len);
diff --git a/tools/perf/config/Makefile.arch b/tools/perf/config/Makefile.arch
index ff95a68741d1ccdb54e54d929f2292e88963a5d1..ac8721ffa6c8c681ccdb563607d5b80917062438 100644
--- a/tools/perf/config/Makefile.arch
+++ b/tools/perf/config/Makefile.arch
@@ -21,6 +21,10 @@ ifeq ($(RAW_ARCH),x86_64)
   endif
 endif
 
+ifeq ($(RAW_ARCH),sparc64)
+  ARCH ?= sparc
+endif
+
 ARCH ?= $(RAW_ARCH)
 
 LP64 := $(shell echo __LP64__ | ${CC} ${CFLAGS} -E -x c - | tail -n 1)
diff --git a/tools/perf/config/feature-checks/Makefile b/tools/perf/config/feature-checks/Makefile
index 42ac05aaf8ac1a8bf004c1e664aa10a1853111bc..b32ff3372514da7da33da49cfc2d52755f1db935 100644
--- a/tools/perf/config/feature-checks/Makefile
+++ b/tools/perf/config/feature-checks/Makefile
@@ -49,7 +49,7 @@ test-hello.bin:
 	$(BUILD)
 
 test-pthread-attr-setaffinity-np.bin:
-	$(BUILD) -Werror -lpthread
+	$(BUILD) -D_GNU_SOURCE -Werror -lpthread
 
 test-stackprotector-all.bin:
 	$(BUILD) -Werror -fstack-protector-all
diff --git a/tools/perf/config/feature-checks/test-pthread-attr-setaffinity-np.c b/tools/perf/config/feature-checks/test-pthread-attr-setaffinity-np.c
index 0a0d3ecb4e8af81b77222c29f92efcbf4485313b..2b81b72eca23726a03b263fb21efa649c13c9495 100644
--- a/tools/perf/config/feature-checks/test-pthread-attr-setaffinity-np.c
+++ b/tools/perf/config/feature-checks/test-pthread-attr-setaffinity-np.c
@@ -5,10 +5,11 @@ int main(void)
 {
 	int ret = 0;
 	pthread_attr_t thread_attr;
+	cpu_set_t cs;
 
 	pthread_attr_init(&thread_attr);
 	/* don't care abt exact args, just the API itself in libpthread */
-	ret = pthread_attr_setaffinity_np(&thread_attr, 0, NULL);
+	ret = pthread_attr_setaffinity_np(&thread_attr, sizeof(cs), &cs);
 
 	return ret;
 }
diff --git a/tools/perf/util/cloexec.c b/tools/perf/util/cloexec.c
index 47b78b3f03257385023629696dda95e57d2b5ab7..6da965bdbc2caf8762b5018c6837422a00aac456 100644
--- a/tools/perf/util/cloexec.c
+++ b/tools/perf/util/cloexec.c
@@ -25,6 +25,10 @@ static int perf_flag_probe(void)
 	if (cpu < 0)
 		cpu = 0;
 
+	/*
+	 * Using -1 for the pid is a workaround to avoid gratuitous jump label
+	 * changes.
+	 */
 	while (1) {
 		/* check cloexec flag */
 		fd = sys_perf_event_open(&attr, pid, cpu, -1,
@@ -47,16 +51,24 @@ static int perf_flag_probe(void)
 		  err, strerror_r(err, sbuf, sizeof(sbuf)));
 
 	/* not supported, confirm error related to PERF_FLAG_FD_CLOEXEC */
-	fd = sys_perf_event_open(&attr, pid, cpu, -1, 0);
+	while (1) {
+		fd = sys_perf_event_open(&attr, pid, cpu, -1, 0);
+		if (fd < 0 && pid == -1 && errno == EACCES) {
+			pid = 0;
+			continue;
+		}
+		break;
+	}
 	err = errno;
 
+	if (fd >= 0)
+		close(fd);
+
 	if (WARN_ONCE(fd < 0 && err != EBUSY,
 		      "perf_event_open(..., 0) failed unexpectedly with error %d (%s)\n",
 		      err, strerror_r(err, sbuf, sizeof(sbuf))))
 		return -1;
 
-	close(fd);
-
 	return 0;
 }
 
diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h
index c94a9e03ecf15744800d4a6bc68cca28ca70259e..e99a67632831a8e6548fae8a40b654f01b009d1e 100644
--- a/tools/perf/util/evlist.h
+++ b/tools/perf/util/evlist.h
@@ -28,7 +28,7 @@ struct perf_mmap {
 	int		 mask;
 	int		 refcnt;
 	unsigned int	 prev;
-	char		 event_copy[PERF_SAMPLE_MAX_SIZE];
+	char		 event_copy[PERF_SAMPLE_MAX_SIZE] __attribute__((aligned(8)));
 };
 
 struct perf_evlist {
diff --git a/tools/perf/util/symbol-elf.c b/tools/perf/util/symbol-elf.c
index b24f9d8727a894ccae13353abad0b951ec1b0916..33b7a2aef71322ab88b93e675c070bfc4a7c7da7 100644
--- a/tools/perf/util/symbol-elf.c
+++ b/tools/perf/util/symbol-elf.c
@@ -11,6 +11,11 @@
 #include <symbol/kallsyms.h>
 #include "debug.h"
 
+#ifndef EM_AARCH64
+#define EM_AARCH64	183  /* ARM 64 bit */
+#endif
+
+
 #ifdef HAVE_CPLUS_DEMANGLE_SUPPORT
 extern char *cplus_demangle(const char *, int);