Borislav Petkov authored

Commit: a33d3317 ("x86/CPU/AMD: Fix Bulldozer topology") restored the
initial approach we had with the Fam15h topology of enumerating CU
(Compute Unit) threads as cores. And this is still correct - they're
beefier than HT threads but still have some shared functionality.

Our current approach has a problem with the Mad Max Steam game, for
example. Yves Dionne reported a certain "choppiness" while playing on
v4.9.5. That problem stems most likely from the fact that the CU
threads share resources within one CU and when we schedule to a thread
of a different compute unit, this incurs latency due to migrating the
working set to a different CU through the caches.

When the thread siblings mask mirrors that aspect of the CUs and
threads, the scheduler pays attention to it and tries to schedule
within one CU first. Which takes care of the latency, of course.

Reported-by: Yves Dionne <yves.dionne@gmail.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
Cc: <stable@vger.kernel.org> # 4.9
Cc: Brice Goglin <Brice.Goglin@inria.fr>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Yazen Ghannam <yazen.ghannam@amd.com>
Link: http://lkml.kernel.org/r/20170205105022.8705-1-bp@alien8.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
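To observe the effect described above, a minimal user-space sketch (not part of this patch) can read the thread siblings mask the kernel already exports through the standard sysfs topology files; with the fix applied, the siblings listed for a CPU on a Fam15h part are the threads of its Compute Unit.

/*
 * Illustrative sketch, not from the patch: print the thread siblings
 * the kernel reports for CPU 0 via the standard sysfs topology
 * interface.  On a fixed Fam15h system these are the threads that
 * share one Compute Unit.
 */
#include <stdio.h>

int main(void)
{
	char buf[256];
	FILE *f = fopen("/sys/devices/system/cpu/cpu0/topology/thread_siblings_list", "r");

	if (!f) {
		perror("thread_siblings_list");
		return 1;
	}
	if (fgets(buf, sizeof(buf), f))
		printf("cpu0 thread siblings: %s", buf);
	fclose(f);
	return 0;
}

Compiled and run, it prints a CPU list such as "0,1" when CPU 0 shares a Compute Unit with CPU 1.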
bitops_32.h
/*
* bitops.h: Bit string operations on the Sparc.
*
* Copyright 1995 David S. Miller (davem@caip.rutgers.edu)
* Copyright 1996 Eddie C. Dost (ecd@skynet.be)
* Copyright 2001 Anton Blanchard (anton@samba.org)
*/
#ifndef _SPARC_BITOPS_H
#define _SPARC_BITOPS_H

#include <linux/compiler.h>
#include <asm/byteorder.h>

#ifdef __KERNEL__

#ifndef _LINUX_BITOPS_H
#error only <linux/bitops.h> can be included directly
#endif

extern unsigned long ___set_bit(unsigned long *addr, unsigned long mask);
extern unsigned long ___clear_bit(unsigned long *addr, unsigned long mask);
extern unsigned long ___change_bit(unsigned long *addr, unsigned long mask);
/*
* Set bit 'nr' in 32-bit quantity at address 'addr' where bit '0'
* is in the highest of the four bytes and bit '31' is the high bit
* within the first byte. Sparc is BIG-Endian. Unless noted otherwise
* all bit-ops return 0 if bit was previously clear and != 0 otherwise.
*/
static inline int test_and_set_bit(unsigned long nr, volatile unsigned long *addr)
{
	unsigned long *ADDR, mask;

	ADDR = ((unsigned long *) addr) + (nr >> 5);
	mask = 1 << (nr & 31);

	return ___set_bit(ADDR, mask) != 0;
}

static inline void set_bit(unsigned long nr, volatile unsigned long *addr)
{
	unsigned long *ADDR, mask;

	ADDR = ((unsigned long *) addr) + (nr >> 5);
	mask = 1 << (nr & 31);

	(void) ___set_bit(ADDR, mask);
}

static inline int test_and_clear_bit(unsigned long nr, volatile unsigned long *addr)
{
	unsigned long *ADDR, mask;

	ADDR = ((unsigned long *) addr) + (nr >> 5);
	mask = 1 << (nr & 31);

	return ___clear_bit(ADDR, mask) != 0;
}

static inline void clear_bit(unsigned long nr, volatile unsigned long *addr)
{
	unsigned long *ADDR, mask;

	ADDR = ((unsigned long *) addr) + (nr >> 5);
	mask = 1 << (nr & 31);

	(void) ___clear_bit(ADDR, mask);
}

static inline int test_and_change_bit(unsigned long nr, volatile unsigned long *addr)
{
	unsigned long *ADDR, mask;

	ADDR = ((unsigned long *) addr) + (nr >> 5);
	mask = 1 << (nr & 31);

	return ___change_bit(ADDR, mask) != 0;
}

static inline void change_bit(unsigned long nr, volatile unsigned long *addr)
{
	unsigned long *ADDR, mask;

	ADDR = ((unsigned long *) addr) + (nr >> 5);
	mask = 1 << (nr & 31);

	(void) ___change_bit(ADDR, mask);
}
#include <asm-generic/bitops/non-atomic.h>

#define smp_mb__before_clear_bit()	do { } while(0)
#define smp_mb__after_clear_bit()	do { } while(0)

#include <asm-generic/bitops/ffz.h>
#include <asm-generic/bitops/__ffs.h>
#include <asm-generic/bitops/sched.h>
#include <asm-generic/bitops/ffs.h>
#include <asm-generic/bitops/fls.h>
#include <asm-generic/bitops/fls64.h>
#include <asm-generic/bitops/hweight.h>
#include <asm-generic/bitops/lock.h>
#include <asm-generic/bitops/find.h>
#include <asm-generic/bitops/ext2-non-atomic.h>
#include <asm-generic/bitops/ext2-atomic.h>
#include <asm-generic/bitops/minix.h>

#endif /* __KERNEL__ */
#endif /* defined(_SPARC_BITOPS_H) */
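For readers unfamiliar with the addressing convention documented in the header comment above (bit 'nr' lives in 32-bit word 'nr >> 5' under mask '1 << (nr & 31)'), the following stand-alone sketch reproduces that index/mask arithmetic in ordinary user-space C. It is illustrative only: it is non-atomic and does not use the Sparc ___set_bit/___clear_bit assembly helpers this header relies on.

/* Illustrative only: the same word-index/mask arithmetic as the
 * helpers above, on explicit 32-bit words and without any atomicity. */
#include <stdio.h>
#include <stdint.h>

static void demo_set_bit(unsigned long nr, uint32_t *bitmap)
{
	bitmap[nr >> 5] |= (uint32_t)1 << (nr & 31);
}

static int demo_test_bit(unsigned long nr, const uint32_t *bitmap)
{
	return (bitmap[nr >> 5] & ((uint32_t)1 << (nr & 31))) != 0;
}

int main(void)
{
	uint32_t bitmap[4] = { 0 };	/* 128 bits */

	demo_set_bit(35, bitmap);	/* word 1 (35 >> 5), mask 1 << 3 (35 & 31) */
	printf("bit 35 set: %d, word[1] = 0x%x\n",
	       demo_test_bit(35, bitmap), bitmap[1]);
	return 0;
}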