[perf] arm: Improve performance with barrel shifter.

Instead of using a struct member to hold the last barrel shifter carry output, which is expansive as it needs to be saved/loaded from memory, I now pass the carry around as an INOUT parameter.
Using perf anotate really shows high perctile of samples being spent on reading/writing `self.bs_carry_out`

Since this is a rather "surgical" changeset, I have made sure to run it against eggvance test suite, mGBA test suite and some games as well.

I actually saw better improvements than what the benchmark measured, but 7% is decent enough :)

```
run_60_frames           time:   [180.18 ms 180.45 ms 180.77 ms]
                        change: [-7.2464% -6.9081% -6.6324%] (p = 0.00 < 0.05)
                        Performance has improved.
```


Former-commit-id: 7cd7105a07aa0b78cab9dc8bbae3682b02b7ab7c
Former-commit-id: c68514beb3fa6c34f5f65544acbead21e527dbb0
This commit is contained in:
Michel Heily 2021-06-05 18:54:46 +03:00
parent caf89d8532
commit 20506091cc
8 changed files with 78 additions and 87 deletions

View file

@ -111,66 +111,59 @@ impl BarrelShifterValue {
}
impl<I: MemoryInterface> Core<I> {
pub fn lsl(&mut self, val: u32, amount: u32, carry_in: bool) -> u32 {
pub fn lsl(&mut self, val: u32, amount: u32, carry: &mut bool) -> u32 {
match amount {
0 => {
self.bs_carry_out = carry_in;
val
}
0 => val,
x if x < 32 => {
self.bs_carry_out = val.wrapping_shr(32 - x) & 1 == 1;
*carry = val.wrapping_shr(32 - x) & 1 == 1;
val << x
}
32 => {
self.bs_carry_out = val & 1 == 1;
*carry = val & 1 == 1;
0
}
_ => {
self.bs_carry_out = false;
*carry = false;
0
}
}
}
pub fn lsr(&mut self, val: u32, amount: u32, carry_in: bool, immediate: bool) -> u32 {
pub fn lsr(&mut self, val: u32, amount: u32, carry: &mut bool, immediate: bool) -> u32 {
if amount != 0 {
match amount {
x if x < 32 => {
self.bs_carry_out = (val >> (amount - 1) & 1) == 1;
*carry = (val >> (amount - 1) & 1) == 1;
val >> amount
}
32 => {
self.bs_carry_out = val.bit(31);
*carry = val.bit(31);
0
}
_ => {
self.bs_carry_out = false;
*carry = false;
0
}
}
} else if immediate {
self.bs_carry_out = val.bit(31);
*carry = val.bit(31);
0
} else {
self.bs_carry_out = carry_in;
val
}
}
pub fn asr(&mut self, val: u32, amount: u32, carry_in: bool, immediate: bool) -> u32 {
pub fn asr(&mut self, val: u32, amount: u32, carry: &mut bool, immediate: bool) -> u32 {
let amount = if immediate && amount == 0 { 32 } else { amount };
match amount {
0 => {
self.bs_carry_out = carry_in;
val
}
0 => val,
x if x < 32 => {
self.bs_carry_out = val.wrapping_shr(amount - 1) & 1 == 1;
*carry = val.wrapping_shr(amount - 1) & 1 == 1;
(val as i32).wrapping_shr(amount) as u32
}
_ => {
let bit31 = val.bit(31);
self.bs_carry_out = bit31;
*carry = bit31;
if bit31 {
0xffffffff
} else {
@ -180,9 +173,9 @@ impl<I: MemoryInterface> Core<I> {
}
}
pub fn rrx(&mut self, val: u32, carry_in: bool) -> u32 {
let old_c = carry_in as i32;
self.bs_carry_out = val & 0b1 != 0;
pub fn rrx(&mut self, val: u32, carry: &mut bool) -> u32 {
let old_c = *carry as i32;
*carry = val & 0b1 != 0;
(((val as u32) >> 1) as i32 | (old_c << 31)) as u32
}
@ -190,14 +183,14 @@ impl<I: MemoryInterface> Core<I> {
&mut self,
val: u32,
amount: u32,
carry_in: bool,
carry: &mut bool,
immediate: bool,
rrx: bool,
) -> u32 {
match amount {
0 => {
if immediate & rrx {
self.rrx(val, carry_in)
self.rrx(val, carry)
} else {
val
}
@ -209,7 +202,7 @@ impl<I: MemoryInterface> Core<I> {
} else {
val
};
self.bs_carry_out = (val as u32).bit(31);
*carry = (val as u32).bit(31);
val
}
}
@ -222,12 +215,9 @@ impl<I: MemoryInterface> Core<I> {
shift: BarrelShiftOpCode,
val: u32,
amount: u32,
carry_in: bool,
carry: &mut bool,
immediate: bool,
) -> u32 {
// TODO get rid of Core::bs_carry_out field in favour sending the carry as a &mut reference,
// Forcing calling functions to do something with the carry :
self.bs_carry_out = carry_in;
//
// From GBATEK:
// Zero Shift Amount (Shift Register by Immediate, with Immediate=0)
@ -248,10 +238,10 @@ impl<I: MemoryInterface> Core<I> {
// in the range 1 to 32 and see above.
//
match shift {
BarrelShiftOpCode::LSL => self.lsl(val, amount, carry_in),
BarrelShiftOpCode::LSR => self.lsr(val, amount, carry_in, immediate),
BarrelShiftOpCode::ASR => self.asr(val, amount, carry_in, immediate),
BarrelShiftOpCode::ROR => self.ror(val, amount, carry_in, immediate, true),
BarrelShiftOpCode::LSL => self.lsl(val, amount, carry),
BarrelShiftOpCode::LSR => self.lsr(val, amount, carry, immediate),
BarrelShiftOpCode::ASR => self.asr(val, amount, carry, immediate),
BarrelShiftOpCode::ROR => self.ror(val, amount, carry, immediate, true),
}
}
@ -261,7 +251,7 @@ impl<I: MemoryInterface> Core<I> {
bs_op: BarrelShiftOpCode,
reg: usize,
rs: usize,
carry: bool,
carry: &mut bool,
) -> u32 {
let mut val = self.get_reg(reg);
if reg == REG_PC {
@ -271,8 +261,7 @@ impl<I: MemoryInterface> Core<I> {
self.barrel_shift_op(bs_op, val, amount, carry, false)
}
pub fn register_shift(&mut self, shift: &ShiftedRegister) -> u32 {
let carry = self.cpsr.C();
pub fn register_shift(&mut self, shift: &ShiftedRegister, carry: &mut bool) -> u32 {
match shift.shift_by {
ShiftRegisterBy::ByAmount(amount) => {
let result =
@ -285,13 +274,13 @@ impl<I: MemoryInterface> Core<I> {
}
}
pub fn get_barrel_shifted_value(&mut self, sval: &BarrelShifterValue) -> u32 {
pub fn get_barrel_shifted_value(&mut self, sval: &BarrelShifterValue, carry: &mut bool) -> u32 {
// TODO decide if error handling or panic here
match sval {
BarrelShifterValue::ImmediateValue(offset) => *offset as u32,
BarrelShifterValue::ShiftedRegister(shifted_reg) => {
let added = (*shifted_reg).added.unwrap_or(true);
let abs = self.register_shift(shifted_reg) as u32;
let abs = self.register_shift(shifted_reg, carry) as u32;
if added {
abs as u32
} else {

View file

@ -93,7 +93,10 @@ impl<I: MemoryInterface> Core<I> {
if insn.bit(25) {
let immediate = insn & 0xff;
let rotate = 2 * insn.bit_range(8..12);
self.ror(immediate, rotate, self.cpsr.C(), false, true)
let mut carry = self.cpsr.C();
let v = self.ror(immediate, rotate, &mut carry, false, true);
self.cpsr.set_C(carry);
v
} else {
self.get_reg((insn & 0b1111) as usize)
}
@ -175,13 +178,12 @@ impl<I: MemoryInterface> Core<I> {
let mut s_flag = insn.set_cond_flag();
let opcode = insn.opcode();
let mut carry = self.cpsr.C();
let op2 = if insn.bit(25) {
let immediate = insn & 0xff;
let rotate = 2 * insn.bit_range(8..12);
// TODO refactor out
let bs_carry_in = self.cpsr.C();
self.bs_carry_out = bs_carry_in;
self.ror(immediate, rotate, self.cpsr.C(), false, true)
self.ror(immediate, rotate, &mut carry, false, true)
} else {
let reg = insn & 0xf;
@ -203,7 +205,7 @@ impl<I: MemoryInterface> Core<I> {
shift_by: shift_by,
added: None,
};
self.register_shift(&shifted_reg)
self.register_shift(&shifted_reg, &mut carry)
};
if rd == REG_PC && s_flag {
@ -211,9 +213,7 @@ impl<I: MemoryInterface> Core<I> {
s_flag = false;
}
let carry = self.cpsr.C() as u32;
let alu_res = if s_flag {
let mut carry = self.bs_carry_out;
let mut overflow = self.cpsr.V();
let result = match opcode {
AND | TST => op1 & op2,
@ -238,15 +238,16 @@ impl<I: MemoryInterface> Core<I> {
Some(result)
}
} else {
let c = carry as u32;
Some(match opcode {
AND => op1 & op2,
EOR => op1 ^ op2,
SUB => op1.wrapping_sub(op2),
RSB => op2.wrapping_sub(op1),
ADD => op1.wrapping_add(op2),
ADC => op1.wrapping_add(op2).wrapping_add(carry),
SBC => op1.wrapping_sub(op2.wrapping_add(1 - carry)),
RSC => op2.wrapping_sub(op1.wrapping_add(1 - carry)),
ADC => op1.wrapping_add(op2).wrapping_add(c),
SBC => op1.wrapping_sub(op2.wrapping_add(1 - c)),
RSC => op2.wrapping_sub(op1.wrapping_add(1 - c)),
ORR => op1 | op2,
MOV => op2,
BIC => op1 & (!op2),
@ -290,7 +291,9 @@ impl<I: MemoryInterface> Core<I> {
if base_reg == REG_PC {
addr = self.pc_arm() + 8; // prefetching
}
let offset = self.get_barrel_shifted_value(&insn.ldr_str_offset()); // TODO: wrong to use in here
let mut carry = self.cpsr.C();
let offset = self.get_barrel_shifted_value(&insn.ldr_str_offset(), &mut carry); // TODO: wrong to use in here
self.cpsr.set_C(carry);
let effective_addr = (addr as i32).wrapping_add(offset as i32) as Addr;
// TODO - confirm this
@ -350,15 +353,16 @@ impl<I: MemoryInterface> Core<I> {
}
pub fn exec_arm_ldr_str_hs_reg(&mut self, insn: u32) -> CpuAction {
self.ldr_str_hs(
insn,
BarrelShifterValue::ShiftedRegister(ShiftedRegister {
reg: (insn & 0xf) as usize,
shift_by: ShiftRegisterBy::ByAmount(0),
bs_op: BarrelShiftOpCode::LSL,
added: Some(insn.add_offset_flag()),
}),
)
let offset = {
let added = insn.add_offset_flag();
let abs = self.get_reg((insn & 0xf) as usize);
if added {
abs as u32
} else {
(-(abs as i32)) as u32
}
};
self.ldr_str_hs_common(insn, offset)
}
pub fn exec_arm_ldr_str_hs_imm(&mut self, insn: u32) -> CpuAction {
@ -368,11 +372,11 @@ impl<I: MemoryInterface> Core<I> {
} else {
(-(offset8 as i32)) as u32
};
self.ldr_str_hs(insn, BarrelShifterValue::ImmediateValue(offset8))
self.ldr_str_hs_common(insn, offset8)
}
#[inline(always)]
pub fn ldr_str_hs(&mut self, insn: u32, offset: BarrelShifterValue) -> CpuAction {
pub fn ldr_str_hs_common(&mut self, insn: u32, offset: u32) -> CpuAction {
let mut result = CpuAction::AdvancePC(NonSeq);
let load = insn.load_flag();
@ -385,8 +389,6 @@ impl<I: MemoryInterface> Core<I> {
addr = self.pc_arm() + 8; // prefetching
}
let offset = self.get_barrel_shifted_value(&offset);
// TODO - confirm this
let old_mode = self.cpsr.mode();
if !pre_index && writeback {

View file

@ -83,8 +83,6 @@ pub struct SavedCpuState {
pub(super) spsr: RegPSR,
pub(super) banks: BankedRegisters,
pub(super) bs_carry_out: bool,
}
#[derive(Clone, Debug)]
@ -125,9 +123,6 @@ pub struct Core<I: MemoryInterface> {
pub cpsr: RegPSR,
pub(super) spsr: RegPSR,
// Todo - do I still need this?
pub(super) bs_carry_out: bool,
pub(super) banks: BankedRegisters,
#[cfg(feature = "debugger")]
@ -146,7 +141,6 @@ impl<I: MemoryInterface> Core<I> {
cpsr,
spsr: Default::default(),
banks: BankedRegisters::default(),
bs_carry_out: false,
#[cfg(feature = "debugger")]
dbg_info: DebuggerInfo::default(),
@ -167,7 +161,6 @@ impl<I: MemoryInterface> Core<I> {
banks: state.banks,
spsr: state.spsr,
bs_carry_out: state.bs_carry_out,
pipeline: state.pipeline,
next_fetch_access: state.next_fetch_access,
@ -184,7 +177,6 @@ impl<I: MemoryInterface> Core<I> {
gpr: self.gpr.clone(),
spsr: self.spsr,
banks: self.banks.clone(),
bs_carry_out: self.bs_carry_out,
pipeline: self.pipeline.clone(),
next_fetch_access: self.next_fetch_access,
}
@ -196,7 +188,6 @@ impl<I: MemoryInterface> Core<I> {
self.gpr = state.gpr;
self.spsr = state.spsr;
self.banks = state.banks;
self.bs_carry_out = state.bs_carry_out;
self.pipeline = state.pipeline;
self.next_fetch_access = state.next_fetch_access;
}

View file

@ -1,8 +1,8 @@
use std::fmt;
use std::marker::PhantomData;
use super::InstructionDecoder;
use super::Addr;
use super::InstructionDecoder;
pub struct Disassembler<'a, D>
where

View file

@ -134,7 +134,10 @@ impl<I: MemoryInterface> Core<I> {
if addr & 0x3 != 0 {
let rotation = (addr & 0x3) << 3;
let value = self.load_32(addr & !0x3, access);
self.ror(value, rotation, self.cpsr.C(), false, false)
let mut carry = self.cpsr.C();
let v = self.ror(value, rotation, &mut carry, false, false);
self.cpsr.set_C(carry);
v
} else {
self.load_32(addr, access)
}
@ -146,7 +149,10 @@ impl<I: MemoryInterface> Core<I> {
if addr & 0x1 != 0 {
let rotation = (addr & 0x1) << 3;
let value = self.load_16(addr & !0x1, access);
self.ror(value as u32, rotation, self.cpsr.C(), false, false)
let mut carry = self.cpsr.C();
let v = self.ror(value as u32, rotation, &mut carry, false, false);
self.cpsr.set_C(carry);
v
} else {
self.load_16(addr, access) as u32
}

View file

@ -15,15 +15,16 @@ impl<I: MemoryInterface> Core<I> {
let rs = insn.bit_range(3..6) as usize;
let shift_amount = insn.offset5() as u8 as u32;
let mut carry = self.cpsr.C();
let op2 = self.barrel_shift_op(
insn.format1_op(),
self.gpr[rs],
shift_amount,
self.cpsr.C(),
&mut carry,
true,
);
self.gpr[rd] = op2;
self.alu_update_flags(op2, false, self.bs_carry_out, self.cpsr.V());
self.alu_update_flags(op2, false, carry, self.cpsr.V());
CpuAction::AdvancePC(Seq)
}
@ -95,9 +96,8 @@ impl<I: MemoryInterface> Core<I> {
macro_rules! shifter_op {
($bs_op:expr) => {{
let result = self.shift_by_register($bs_op, rd, rs, carry);
let result = self.shift_by_register($bs_op, rd, rs, &mut carry);
self.idle_cycle();
carry = self.bs_carry_out;
result
}};
}

View file

@ -336,7 +336,10 @@ impl EepromController {
match (src, dst) {
// DMA to EEPROM
(_, 0x0d000000..=0x0dffffff) => {
debug!("caught eeprom dma transfer src={:#x} dst={:#x} count={}", src, dst, count);
debug!(
"caught eeprom dma transfer src={:#x} dst={:#x} count={}",
src, dst, count
);
let eeprom_type = match count {
// Read(11) + 6bit address + stop bit
9 => Eeprom512,
@ -346,7 +349,10 @@ impl EepromController {
73 => Eeprom512,
// Write(11) + 14bit address + 64bit value + stop bit
81 => Eeprom8k,
_ => panic!("unexpected bit count ({}) when detecting eeprom size", count)
_ => panic!(
"unexpected bit count ({}) when detecting eeprom size",
count
),
};
info!("detected eeprom type: {:?}", eeprom_type);
self.chip.borrow_mut().set_type(eeprom_type);
@ -356,7 +362,7 @@ impl EepromController {
(0x0d000000..=0x0dffffff, _) => {
panic!("reading from eeprom when real size is not detected yet is not supported by this emulator")
}
_ => {/* Not a eeprom dma, doing nothing */}
_ => { /* Not a eeprom dma, doing nothing */ }
}
} else {
// this might be a eeprom request, so we need to reset the eeprom state machine if its dirty (due to bad behaving games, or tests roms)

View file

@ -291,10 +291,7 @@ impl SoundController {
self.resampler.in_freq = self.sample_rate;
}
self.cycles_per_sample = 512 >> resolution;
info!(
"bias - setting sample frequency to {}hz",
self.sample_rate
);
info!("bias - setting sample frequency to {}hz", self.sample_rate);
// TODO this will not affect the currently scheduled sample event
}