Commit e6d9bfc6 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge branch 'powernv-cpuidle' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc

Pull powerpc non-virtualized cpuidle from Ben Herrenschmidt:
 "This is the branch I mentioned in my other pull request which contains
  our improved cpuidle support for the "powernv" platform
  (non-virtualized).

  It adds support for the "fast sleep" feature of the processor which
  provides higher power savings than our usual "nap" mode but at the
  cost of losing the timers while asleep, and thus exploits the new
  timer broadcast framework to work around that limitation.

  It's based on a tip timer tree that you seem to have already merged"

* 'powernv-cpuidle' of git://git.kernel.org/pub/scm/linux/kernel/git/benh/powerpc:
  cpuidle/powernv: Parse device tree to setup idle states
  cpuidle/powernv: Add "Fast-Sleep" CPU idle state
  powerpc/powernv: Add OPAL call to resync timebase on wakeup
  powerpc/powernv: Add context management for Fast Sleep
  powerpc: Split timer_interrupt() into timer handling and interrupt handling routines
  powerpc: Implement tick broadcast IPI as a fixed IPI message
  powerpc: Free up the slot of PPC_MSG_CALL_FUNC_SINGLE IPI message
parents 235c7b9f 0888839c
......@@ -130,6 +130,8 @@ config PPC
select GENERIC_CMOS_UPDATE
select GENERIC_TIME_VSYSCALL_OLD
select GENERIC_CLOCKEVENTS
select GENERIC_CLOCKEVENTS_BROADCAST if SMP
select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST
select GENERIC_STRNCPY_FROM_USER
select GENERIC_STRNLEN_USER
select HAVE_MOD_ARCH_SPECIFIC
......
......@@ -161,6 +161,7 @@ extern int opal_enter_rtas(struct rtas_args *args,
#define OPAL_FLASH_VALIDATE 76
#define OPAL_FLASH_MANAGE 77
#define OPAL_FLASH_UPDATE 78
#define OPAL_RESYNC_TIMEBASE 79
#define OPAL_DUMP_INIT 81
#define OPAL_DUMP_INFO 82
#define OPAL_DUMP_READ 83
......@@ -923,6 +924,7 @@ extern int opal_machine_check(struct pt_regs *regs);
extern bool opal_mce_check_early_recovery(struct pt_regs *regs);
extern void opal_shutdown(void);
extern int opal_resync_timebase(void);
extern void opal_lpc_init(void);
......
......@@ -450,6 +450,7 @@ enum idle_boot_override {IDLE_NO_OVERRIDE = 0, IDLE_POWERSAVE_OFF};
extern int powersave_nap; /* set if nap mode can be used in idle loop */
extern void power7_nap(void);
extern void power7_sleep(void);
extern void flush_instruction_cache(void);
extern void hard_reset_now(void);
extern void poweroff_now(void);
......
......@@ -120,7 +120,7 @@ extern int cpu_to_core_id(int cpu);
* in /proc/interrupts will be wrong!!! --Troy */
#define PPC_MSG_CALL_FUNCTION 0
#define PPC_MSG_RESCHEDULE 1
#define PPC_MSG_CALL_FUNC_SINGLE 2
#define PPC_MSG_TICK_BROADCAST 2
#define PPC_MSG_DEBUGGER_BREAK 3
/* for irq controllers that have dedicated ipis per message (4) */
......
......@@ -28,6 +28,7 @@ extern struct clock_event_device decrementer_clockevent;
struct rtc_time;
extern void to_tm(int tim, struct rtc_time * tm);
extern void GregorianDay(struct rtc_time *tm);
extern void tick_broadcast_ipi_handler(void);
extern void generic_calibrate_decr(void);
......
......@@ -121,9 +121,10 @@ BEGIN_FTR_SECTION
cmpwi cr1,r13,2
/* Total loss of HV state is fatal, we could try to use the
* PIR to locate a PACA, then use an emergency stack etc...
* but for now, let's just stay stuck here
* OPAL v3 based powernv platforms have new idle states
* which fall in this catagory.
*/
bgt cr1,.
bgt cr1,8f
GET_PACA(r13)
#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
......@@ -141,6 +142,11 @@ BEGIN_FTR_SECTION
beq cr1,2f
b .power7_wakeup_noloss
2: b .power7_wakeup_loss
/* Fast Sleep wakeup on PowerNV */
8: GET_PACA(r13)
b .power7_wakeup_tb_loss
9:
END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
#endif /* CONFIG_PPC_P7_NAP */
......
......@@ -17,20 +17,31 @@
#include <asm/ppc-opcode.h>
#include <asm/hw_irq.h>
#include <asm/kvm_book3s_asm.h>
#include <asm/opal.h>
#undef DEBUG
.text
/* Idle state entry routines */
_GLOBAL(power7_idle)
/* Now check if user or arch enabled NAP mode */
LOAD_REG_ADDRBASE(r3,powersave_nap)
lwz r4,ADDROFF(powersave_nap)(r3)
cmpwi 0,r4,0
beqlr
/* fall through */
#define IDLE_STATE_ENTER_SEQ(IDLE_INST) \
/* Magic NAP/SLEEP/WINKLE mode enter sequence */ \
std r0,0(r1); \
ptesync; \
ld r0,0(r1); \
1: cmp cr0,r0,r0; \
bne 1b; \
IDLE_INST; \
b .
_GLOBAL(power7_nap)
.text
/*
* Pass requested state in r3:
* 0 - nap
* 1 - sleep
*/
_GLOBAL(power7_powersave_common)
/* Use r3 to pass state nap/sleep/winkle */
/* NAP is a state loss, we create a regs frame on the
* stack, fill it up with the state we care about and
* stick a pointer to it in PACAR1. We really only
......@@ -79,8 +90,8 @@ _GLOBAL(power7_nap)
/* Continue saving state */
SAVE_GPR(2, r1)
SAVE_NVGPRS(r1)
mfcr r3
std r3,_CCR(r1)
mfcr r4
std r4,_CCR(r1)
std r9,_MSR(r1)
std r1,PACAR1(r13)
......@@ -90,15 +101,56 @@ _GLOBAL(power7_enter_nap_mode)
li r4,KVM_HWTHREAD_IN_NAP
stb r4,HSTATE_HWTHREAD_STATE(r13)
#endif
cmpwi cr0,r3,1
beq 2f
IDLE_STATE_ENTER_SEQ(PPC_NAP)
/* No return */
2: IDLE_STATE_ENTER_SEQ(PPC_SLEEP)
/* No return */
/* Magic NAP mode enter sequence */
std r0,0(r1)
ptesync
ld r0,0(r1)
1: cmp cr0,r0,r0
bne 1b
PPC_NAP
b .
_GLOBAL(power7_idle)
/* Now check if user or arch enabled NAP mode */
LOAD_REG_ADDRBASE(r3,powersave_nap)
lwz r4,ADDROFF(powersave_nap)(r3)
cmpwi 0,r4,0
beqlr
/* fall through */
_GLOBAL(power7_nap)
li r3,0
b power7_powersave_common
/* No return */
_GLOBAL(power7_sleep)
li r3,1
b power7_powersave_common
/* No return */
_GLOBAL(power7_wakeup_tb_loss)
ld r2,PACATOC(r13);
ld r1,PACAR1(r13)
/* Time base re-sync */
li r0,OPAL_RESYNC_TIMEBASE
LOAD_REG_ADDR(r11,opal);
ld r12,8(r11);
ld r2,0(r11);
mtctr r12
bctrl
/* TODO: Check r3 for failure */
REST_NVGPRS(r1)
REST_GPR(2, r1)
ld r3,_CCR(r1)
ld r4,_MSR(r1)
ld r5,_NIP(r1)
addi r1,r1,INT_FRAME_SIZE
mtcr r3
mfspr r3,SPRN_SRR1 /* Return SRR1 */
mtspr SPRN_SRR1,r4
mtspr SPRN_SRR0,r5
rfid
_GLOBAL(power7_wakeup_loss)
ld r1,PACAR1(r13)
......
......@@ -35,6 +35,7 @@
#include <asm/ptrace.h>
#include <linux/atomic.h>
#include <asm/irq.h>
#include <asm/hw_irq.h>
#include <asm/page.h>
#include <asm/pgtable.h>
#include <asm/prom.h>
......@@ -145,9 +146,9 @@ static irqreturn_t reschedule_action(int irq, void *data)
return IRQ_HANDLED;
}
static irqreturn_t call_function_single_action(int irq, void *data)
static irqreturn_t tick_broadcast_ipi_action(int irq, void *data)
{
generic_smp_call_function_single_interrupt();
tick_broadcast_ipi_handler();
return IRQ_HANDLED;
}
......@@ -168,14 +169,14 @@ static irqreturn_t debug_ipi_action(int irq, void *data)
static irq_handler_t smp_ipi_action[] = {
[PPC_MSG_CALL_FUNCTION] = call_function_action,
[PPC_MSG_RESCHEDULE] = reschedule_action,
[PPC_MSG_CALL_FUNC_SINGLE] = call_function_single_action,
[PPC_MSG_TICK_BROADCAST] = tick_broadcast_ipi_action,
[PPC_MSG_DEBUGGER_BREAK] = debug_ipi_action,
};
const char *smp_ipi_name[] = {
[PPC_MSG_CALL_FUNCTION] = "ipi call function",
[PPC_MSG_RESCHEDULE] = "ipi reschedule",
[PPC_MSG_CALL_FUNC_SINGLE] = "ipi call function single",
[PPC_MSG_TICK_BROADCAST] = "ipi tick-broadcast",
[PPC_MSG_DEBUGGER_BREAK] = "ipi debugger",
};
......@@ -251,8 +252,8 @@ irqreturn_t smp_ipi_demux(void)
generic_smp_call_function_interrupt();
if (all & IPI_MESSAGE(PPC_MSG_RESCHEDULE))
scheduler_ipi();
if (all & IPI_MESSAGE(PPC_MSG_CALL_FUNC_SINGLE))
generic_smp_call_function_single_interrupt();
if (all & IPI_MESSAGE(PPC_MSG_TICK_BROADCAST))
tick_broadcast_ipi_handler();
if (all & IPI_MESSAGE(PPC_MSG_DEBUGGER_BREAK))
debug_ipi_action(0, NULL);
} while (info->messages);
......@@ -280,7 +281,7 @@ EXPORT_SYMBOL_GPL(smp_send_reschedule);
void arch_send_call_function_single_ipi(int cpu)
{
do_message_pass(cpu, PPC_MSG_CALL_FUNC_SINGLE);
do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
}
void arch_send_call_function_ipi_mask(const struct cpumask *mask)
......@@ -291,6 +292,16 @@ void arch_send_call_function_ipi_mask(const struct cpumask *mask)
do_message_pass(cpu, PPC_MSG_CALL_FUNCTION);
}
#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST
void tick_broadcast(const struct cpumask *mask)
{
unsigned int cpu;
for_each_cpu(cpu, mask)
do_message_pass(cpu, PPC_MSG_TICK_BROADCAST);
}
#endif
#if defined(CONFIG_DEBUGGER) || defined(CONFIG_KEXEC)
void smp_send_debugger_break(void)
{
......
......@@ -42,6 +42,7 @@
#include <linux/timex.h>
#include <linux/kernel_stat.h>
#include <linux/time.h>
#include <linux/clockchips.h>
#include <linux/init.h>
#include <linux/profile.h>
#include <linux/cpu.h>
......@@ -106,7 +107,7 @@ struct clock_event_device decrementer_clockevent = {
.irq = 0,
.set_next_event = decrementer_set_next_event,
.set_mode = decrementer_set_mode,
.features = CLOCK_EVT_FEAT_ONESHOT,
.features = CLOCK_EVT_FEAT_ONESHOT | CLOCK_EVT_FEAT_C3STOP,
};
EXPORT_SYMBOL(decrementer_clockevent);
......@@ -478,6 +479,47 @@ void arch_irq_work_raise(void)
#endif /* CONFIG_IRQ_WORK */
void __timer_interrupt(void)
{
struct pt_regs *regs = get_irq_regs();
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
struct clock_event_device *evt = &__get_cpu_var(decrementers);
u64 now;
trace_timer_interrupt_entry(regs);
if (test_irq_work_pending()) {
clear_irq_work_pending();
irq_work_run();
}
now = get_tb_or_rtc();
if (now >= *next_tb) {
*next_tb = ~(u64)0;
if (evt->event_handler)
evt->event_handler(evt);
__get_cpu_var(irq_stat).timer_irqs_event++;
} else {
now = *next_tb - now;
if (now <= DECREMENTER_MAX)
set_dec((int)now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
__get_cpu_var(irq_stat).timer_irqs_others++;
}
#ifdef CONFIG_PPC64
/* collect purr register values often, for accurate calculations */
if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
cu->current_tb = mfspr(SPRN_PURR);
}
#endif
trace_timer_interrupt_exit(regs);
}
/*
* timer_interrupt - gets called when the decrementer overflows,
* with interrupts disabled.
......@@ -486,8 +528,6 @@ void timer_interrupt(struct pt_regs * regs)
{
struct pt_regs *old_regs;
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
struct clock_event_device *evt = &__get_cpu_var(decrementers);
u64 now;
/* Ensure a positive value is written to the decrementer, or else
* some CPUs will continue to take decrementer exceptions.
......@@ -519,39 +559,7 @@ void timer_interrupt(struct pt_regs * regs)
old_regs = set_irq_regs(regs);
irq_enter();
trace_timer_interrupt_entry(regs);
if (test_irq_work_pending()) {
clear_irq_work_pending();
irq_work_run();
}
now = get_tb_or_rtc();
if (now >= *next_tb) {
*next_tb = ~(u64)0;
if (evt->event_handler)
evt->event_handler(evt);
__get_cpu_var(irq_stat).timer_irqs_event++;
} else {
now = *next_tb - now;
if (now <= DECREMENTER_MAX)
set_dec((int)now);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
__get_cpu_var(irq_stat).timer_irqs_others++;
}
#ifdef CONFIG_PPC64
/* collect purr register values often, for accurate calculations */
if (firmware_has_feature(FW_FEATURE_SPLPAR)) {
struct cpu_usage *cu = &__get_cpu_var(cpu_usage_array);
cu->current_tb = mfspr(SPRN_PURR);
}
#endif
trace_timer_interrupt_exit(regs);
__timer_interrupt();
irq_exit();
set_irq_regs(old_regs);
}
......@@ -825,6 +833,15 @@ static void decrementer_set_mode(enum clock_event_mode mode,
decrementer_set_next_event(DECREMENTER_MAX, dev);
}
/* Interrupt handler for the timer broadcast IPI */
void tick_broadcast_ipi_handler(void)
{
u64 *next_tb = &__get_cpu_var(decrementers_next_tb);
*next_tb = get_tb_or_rtc();
__timer_interrupt();
}
static void register_decrementer_clockevent(int cpu)
{
struct clock_event_device *dec = &per_cpu(decrementers, cpu);
......@@ -928,6 +945,7 @@ void __init time_init(void)
clocksource_init();
init_decrementer_clockevent();
tick_setup_hrtimer_broadcast();
}
......
......@@ -215,7 +215,7 @@ void iic_request_IPIs(void)
{
iic_request_ipi(PPC_MSG_CALL_FUNCTION);
iic_request_ipi(PPC_MSG_RESCHEDULE);
iic_request_ipi(PPC_MSG_CALL_FUNC_SINGLE);
iic_request_ipi(PPC_MSG_TICK_BROADCAST);
iic_request_ipi(PPC_MSG_DEBUGGER_BREAK);
}
......
......@@ -131,6 +131,7 @@ OPAL_CALL(opal_write_elog, OPAL_ELOG_WRITE);
OPAL_CALL(opal_validate_flash, OPAL_FLASH_VALIDATE);
OPAL_CALL(opal_manage_flash, OPAL_FLASH_MANAGE);
OPAL_CALL(opal_update_flash, OPAL_FLASH_UPDATE);
OPAL_CALL(opal_resync_timebase, OPAL_RESYNC_TIMEBASE);
OPAL_CALL(opal_dump_init, OPAL_DUMP_INIT);
OPAL_CALL(opal_dump_info, OPAL_DUMP_INFO);
OPAL_CALL(opal_dump_info2, OPAL_DUMP_INFO2);
......
......@@ -76,7 +76,7 @@ static int __init ps3_smp_probe(void)
BUILD_BUG_ON(PPC_MSG_CALL_FUNCTION != 0);
BUILD_BUG_ON(PPC_MSG_RESCHEDULE != 1);
BUILD_BUG_ON(PPC_MSG_CALL_FUNC_SINGLE != 2);
BUILD_BUG_ON(PPC_MSG_TICK_BROADCAST != 2);
BUILD_BUG_ON(PPC_MSG_DEBUGGER_BREAK != 3);
for (i = 0; i < MSG_COUNT; i++) {
......
......@@ -11,11 +11,19 @@
#include <linux/cpuidle.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/clockchips.h>
#include <linux/of.h>
#include <asm/machdep.h>
#include <asm/firmware.h>
#include <asm/runlatch.h>
/* Flags and constants used in PowerNV platform */
#define MAX_POWERNV_IDLE_STATES 8
#define IDLE_USE_INST_NAP 0x00010000 /* Use nap instruction */
#define IDLE_USE_INST_SLEEP 0x00020000 /* Use sleep instruction */
struct cpuidle_driver powernv_idle_driver = {
.name = "powernv_idle",
.owner = THIS_MODULE,
......@@ -54,10 +62,36 @@ static int nap_loop(struct cpuidle_device *dev,
return index;
}
static int fastsleep_loop(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index)
{
unsigned long old_lpcr = mfspr(SPRN_LPCR);
unsigned long new_lpcr;
if (unlikely(system_state < SYSTEM_RUNNING))
return index;
new_lpcr = old_lpcr;
new_lpcr &= ~(LPCR_MER | LPCR_PECE); /* lpcr[mer] must be 0 */
/* exit powersave upon external interrupt, but not decrementer
* interrupt.
*/
new_lpcr |= LPCR_PECE0;
mtspr(SPRN_LPCR, new_lpcr);
power7_sleep();
mtspr(SPRN_LPCR, old_lpcr);
return index;
}
/*
* States for dedicated partition case.
*/
static struct cpuidle_state powernv_states[] = {
static struct cpuidle_state powernv_states[MAX_POWERNV_IDLE_STATES] = {
{ /* Snooze */
.name = "snooze",
.desc = "snooze",
......@@ -65,13 +99,6 @@ static struct cpuidle_state powernv_states[] = {
.exit_latency = 0,
.target_residency = 0,
.enter = &snooze_loop },
{ /* NAP */
.name = "NAP",
.desc = "NAP",
.flags = CPUIDLE_FLAG_TIME_VALID,
.exit_latency = 10,
.target_residency = 100,
.enter = &nap_loop },
};
static int powernv_cpuidle_add_cpu_notifier(struct notifier_block *n,
......@@ -132,19 +159,74 @@ static int powernv_cpuidle_driver_init(void)
return 0;
}
static int powernv_add_idle_states(void)
{
struct device_node *power_mgt;
struct property *prop;
int nr_idle_states = 1; /* Snooze */
int dt_idle_states;
u32 *flags;
int i;
/* Currently we have snooze statically defined */
power_mgt = of_find_node_by_path("/ibm,opal/power-mgt");
if (!power_mgt) {
pr_warn("opal: PowerMgmt Node not found\n");
return nr_idle_states;
}
prop = of_find_property(power_mgt, "ibm,cpu-idle-state-flags", NULL);
if (!prop) {
pr_warn("DT-PowerMgmt: missing ibm,cpu-idle-state-flags\n");
return nr_idle_states;
}
dt_idle_states = prop->length / sizeof(u32);
flags = (u32 *) prop->value;
for (i = 0; i < dt_idle_states; i++) {
if (flags[i] & IDLE_USE_INST_NAP) {
/* Add NAP state */
strcpy(powernv_states[nr_idle_states].name, "Nap");
strcpy(powernv_states[nr_idle_states].desc, "Nap");
powernv_states[nr_idle_states].flags = CPUIDLE_FLAG_TIME_VALID;
powernv_states[nr_idle_states].exit_latency = 10;
powernv_states[nr_idle_states].target_residency = 100;
powernv_states[nr_idle_states].enter = &nap_loop;
nr_idle_states++;
}
if (flags[i] & IDLE_USE_INST_SLEEP) {
/* Add FASTSLEEP state */
strcpy(powernv_states[nr_idle_states].name, "FastSleep");
strcpy(powernv_states[nr_idle_states].desc, "FastSleep");
powernv_states[nr_idle_states].flags =
CPUIDLE_FLAG_TIME_VALID | CPUIDLE_FLAG_TIMER_STOP;
powernv_states[nr_idle_states].exit_latency = 300;
powernv_states[nr_idle_states].target_residency = 1000000;
powernv_states[nr_idle_states].enter = &fastsleep_loop;
nr_idle_states++;
}
}
return nr_idle_states;
}
/*
* powernv_idle_probe()
* Choose state table for shared versus dedicated partition
*/
static int powernv_idle_probe(void)
{
if (cpuidle_disable != IDLE_NO_OVERRIDE)
return -ENODEV;
if (firmware_has_feature(FW_FEATURE_OPALv3)) {
cpuidle_state_table = powernv_states;
max_idle_state = ARRAY_SIZE(powernv_states);
/* Device tree can indicate more idle states */
max_idle_state = powernv_add_idle_states();
} else
return -ENODEV;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment