[perf] arm: Improve performance with barrel shifter.
Instead of using a struct member to hold the last barrel shifter carry output, which is expansive as it needs to be saved/loaded from memory, I now pass the carry around as an INOUT parameter. Using perf anotate really shows high perctile of samples being spent on reading/writing `self.bs_carry_out` Since this is a rather "surgical" changeset, I have made sure to run it against eggvance test suite, mGBA test suite and some games as well. I actually saw better improvements than what the benchmark measured, but 7% is decent enough :) ``` run_60_frames time: [180.18 ms 180.45 ms 180.77 ms] change: [-7.2464% -6.9081% -6.6324%] (p = 0.00 < 0.05) Performance has improved. ``` Former-commit-id: 7cd7105a07aa0b78cab9dc8bbae3682b02b7ab7c Former-commit-id: c68514beb3fa6c34f5f65544acbead21e527dbb0
This commit is contained in:
parent
caf89d8532
commit
20506091cc
8 changed files with 78 additions and 87 deletions
|
@ -111,66 +111,59 @@ impl BarrelShifterValue {
|
|||
}
|
||||
|
||||
impl<I: MemoryInterface> Core<I> {
|
||||
pub fn lsl(&mut self, val: u32, amount: u32, carry_in: bool) -> u32 {
|
||||
pub fn lsl(&mut self, val: u32, amount: u32, carry: &mut bool) -> u32 {
|
||||
match amount {
|
||||
0 => {
|
||||
self.bs_carry_out = carry_in;
|
||||
val
|
||||
}
|
||||
0 => val,
|
||||
x if x < 32 => {
|
||||
self.bs_carry_out = val.wrapping_shr(32 - x) & 1 == 1;
|
||||
*carry = val.wrapping_shr(32 - x) & 1 == 1;
|
||||
val << x
|
||||
}
|
||||
32 => {
|
||||
self.bs_carry_out = val & 1 == 1;
|
||||
*carry = val & 1 == 1;
|
||||
0
|
||||
}
|
||||
_ => {
|
||||
self.bs_carry_out = false;
|
||||
*carry = false;
|
||||
0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pub fn lsr(&mut self, val: u32, amount: u32, carry_in: bool, immediate: bool) -> u32 {
|
||||
pub fn lsr(&mut self, val: u32, amount: u32, carry: &mut bool, immediate: bool) -> u32 {
|
||||
if amount != 0 {
|
||||
match amount {
|
||||
x if x < 32 => {
|
||||
self.bs_carry_out = (val >> (amount - 1) & 1) == 1;
|
||||
*carry = (val >> (amount - 1) & 1) == 1;
|
||||
val >> amount
|
||||
}
|
||||
32 => {
|
||||
self.bs_carry_out = val.bit(31);
|
||||
*carry = val.bit(31);
|
||||
0
|
||||
}
|
||||
_ => {
|
||||
self.bs_carry_out = false;
|
||||
*carry = false;
|
||||
0
|
||||
}
|
||||
}
|
||||
} else if immediate {
|
||||
self.bs_carry_out = val.bit(31);
|
||||
*carry = val.bit(31);
|
||||
0
|
||||
} else {
|
||||
self.bs_carry_out = carry_in;
|
||||
val
|
||||
}
|
||||
}
|
||||
|
||||
pub fn asr(&mut self, val: u32, amount: u32, carry_in: bool, immediate: bool) -> u32 {
|
||||
pub fn asr(&mut self, val: u32, amount: u32, carry: &mut bool, immediate: bool) -> u32 {
|
||||
let amount = if immediate && amount == 0 { 32 } else { amount };
|
||||
match amount {
|
||||
0 => {
|
||||
self.bs_carry_out = carry_in;
|
||||
val
|
||||
}
|
||||
0 => val,
|
||||
x if x < 32 => {
|
||||
self.bs_carry_out = val.wrapping_shr(amount - 1) & 1 == 1;
|
||||
*carry = val.wrapping_shr(amount - 1) & 1 == 1;
|
||||
(val as i32).wrapping_shr(amount) as u32
|
||||
}
|
||||
_ => {
|
||||
let bit31 = val.bit(31);
|
||||
self.bs_carry_out = bit31;
|
||||
*carry = bit31;
|
||||
if bit31 {
|
||||
0xffffffff
|
||||
} else {
|
||||
|
@ -180,9 +173,9 @@ impl<I: MemoryInterface> Core<I> {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn rrx(&mut self, val: u32, carry_in: bool) -> u32 {
|
||||
let old_c = carry_in as i32;
|
||||
self.bs_carry_out = val & 0b1 != 0;
|
||||
pub fn rrx(&mut self, val: u32, carry: &mut bool) -> u32 {
|
||||
let old_c = *carry as i32;
|
||||
*carry = val & 0b1 != 0;
|
||||
(((val as u32) >> 1) as i32 | (old_c << 31)) as u32
|
||||
}
|
||||
|
||||
|
@ -190,14 +183,14 @@ impl<I: MemoryInterface> Core<I> {
|
|||
&mut self,
|
||||
val: u32,
|
||||
amount: u32,
|
||||
carry_in: bool,
|
||||
carry: &mut bool,
|
||||
immediate: bool,
|
||||
rrx: bool,
|
||||
) -> u32 {
|
||||
match amount {
|
||||
0 => {
|
||||
if immediate & rrx {
|
||||
self.rrx(val, carry_in)
|
||||
self.rrx(val, carry)
|
||||
} else {
|
||||
val
|
||||
}
|
||||
|
@ -209,7 +202,7 @@ impl<I: MemoryInterface> Core<I> {
|
|||
} else {
|
||||
val
|
||||
};
|
||||
self.bs_carry_out = (val as u32).bit(31);
|
||||
*carry = (val as u32).bit(31);
|
||||
val
|
||||
}
|
||||
}
|
||||
|
@ -222,12 +215,9 @@ impl<I: MemoryInterface> Core<I> {
|
|||
shift: BarrelShiftOpCode,
|
||||
val: u32,
|
||||
amount: u32,
|
||||
carry_in: bool,
|
||||
carry: &mut bool,
|
||||
immediate: bool,
|
||||
) -> u32 {
|
||||
// TODO get rid of Core::bs_carry_out field in favour sending the carry as a &mut reference,
|
||||
// Forcing calling functions to do something with the carry :
|
||||
self.bs_carry_out = carry_in;
|
||||
//
|
||||
// From GBATEK:
|
||||
// Zero Shift Amount (Shift Register by Immediate, with Immediate=0)
|
||||
|
@ -248,10 +238,10 @@ impl<I: MemoryInterface> Core<I> {
|
|||
// in the range 1 to 32 and see above.
|
||||
//
|
||||
match shift {
|
||||
BarrelShiftOpCode::LSL => self.lsl(val, amount, carry_in),
|
||||
BarrelShiftOpCode::LSR => self.lsr(val, amount, carry_in, immediate),
|
||||
BarrelShiftOpCode::ASR => self.asr(val, amount, carry_in, immediate),
|
||||
BarrelShiftOpCode::ROR => self.ror(val, amount, carry_in, immediate, true),
|
||||
BarrelShiftOpCode::LSL => self.lsl(val, amount, carry),
|
||||
BarrelShiftOpCode::LSR => self.lsr(val, amount, carry, immediate),
|
||||
BarrelShiftOpCode::ASR => self.asr(val, amount, carry, immediate),
|
||||
BarrelShiftOpCode::ROR => self.ror(val, amount, carry, immediate, true),
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -261,7 +251,7 @@ impl<I: MemoryInterface> Core<I> {
|
|||
bs_op: BarrelShiftOpCode,
|
||||
reg: usize,
|
||||
rs: usize,
|
||||
carry: bool,
|
||||
carry: &mut bool,
|
||||
) -> u32 {
|
||||
let mut val = self.get_reg(reg);
|
||||
if reg == REG_PC {
|
||||
|
@ -271,8 +261,7 @@ impl<I: MemoryInterface> Core<I> {
|
|||
self.barrel_shift_op(bs_op, val, amount, carry, false)
|
||||
}
|
||||
|
||||
pub fn register_shift(&mut self, shift: &ShiftedRegister) -> u32 {
|
||||
let carry = self.cpsr.C();
|
||||
pub fn register_shift(&mut self, shift: &ShiftedRegister, carry: &mut bool) -> u32 {
|
||||
match shift.shift_by {
|
||||
ShiftRegisterBy::ByAmount(amount) => {
|
||||
let result =
|
||||
|
@ -285,13 +274,13 @@ impl<I: MemoryInterface> Core<I> {
|
|||
}
|
||||
}
|
||||
|
||||
pub fn get_barrel_shifted_value(&mut self, sval: &BarrelShifterValue) -> u32 {
|
||||
pub fn get_barrel_shifted_value(&mut self, sval: &BarrelShifterValue, carry: &mut bool) -> u32 {
|
||||
// TODO decide if error handling or panic here
|
||||
match sval {
|
||||
BarrelShifterValue::ImmediateValue(offset) => *offset as u32,
|
||||
BarrelShifterValue::ShiftedRegister(shifted_reg) => {
|
||||
let added = (*shifted_reg).added.unwrap_or(true);
|
||||
let abs = self.register_shift(shifted_reg) as u32;
|
||||
let abs = self.register_shift(shifted_reg, carry) as u32;
|
||||
if added {
|
||||
abs as u32
|
||||
} else {
|
||||
|
|
|
@ -93,7 +93,10 @@ impl<I: MemoryInterface> Core<I> {
|
|||
if insn.bit(25) {
|
||||
let immediate = insn & 0xff;
|
||||
let rotate = 2 * insn.bit_range(8..12);
|
||||
self.ror(immediate, rotate, self.cpsr.C(), false, true)
|
||||
let mut carry = self.cpsr.C();
|
||||
let v = self.ror(immediate, rotate, &mut carry, false, true);
|
||||
self.cpsr.set_C(carry);
|
||||
v
|
||||
} else {
|
||||
self.get_reg((insn & 0b1111) as usize)
|
||||
}
|
||||
|
@ -175,13 +178,12 @@ impl<I: MemoryInterface> Core<I> {
|
|||
let mut s_flag = insn.set_cond_flag();
|
||||
let opcode = insn.opcode();
|
||||
|
||||
let mut carry = self.cpsr.C();
|
||||
let op2 = if insn.bit(25) {
|
||||
let immediate = insn & 0xff;
|
||||
let rotate = 2 * insn.bit_range(8..12);
|
||||
// TODO refactor out
|
||||
let bs_carry_in = self.cpsr.C();
|
||||
self.bs_carry_out = bs_carry_in;
|
||||
self.ror(immediate, rotate, self.cpsr.C(), false, true)
|
||||
self.ror(immediate, rotate, &mut carry, false, true)
|
||||
} else {
|
||||
let reg = insn & 0xf;
|
||||
|
||||
|
@ -203,7 +205,7 @@ impl<I: MemoryInterface> Core<I> {
|
|||
shift_by: shift_by,
|
||||
added: None,
|
||||
};
|
||||
self.register_shift(&shifted_reg)
|
||||
self.register_shift(&shifted_reg, &mut carry)
|
||||
};
|
||||
|
||||
if rd == REG_PC && s_flag {
|
||||
|
@ -211,9 +213,7 @@ impl<I: MemoryInterface> Core<I> {
|
|||
s_flag = false;
|
||||
}
|
||||
|
||||
let carry = self.cpsr.C() as u32;
|
||||
let alu_res = if s_flag {
|
||||
let mut carry = self.bs_carry_out;
|
||||
let mut overflow = self.cpsr.V();
|
||||
let result = match opcode {
|
||||
AND | TST => op1 & op2,
|
||||
|
@ -238,15 +238,16 @@ impl<I: MemoryInterface> Core<I> {
|
|||
Some(result)
|
||||
}
|
||||
} else {
|
||||
let c = carry as u32;
|
||||
Some(match opcode {
|
||||
AND => op1 & op2,
|
||||
EOR => op1 ^ op2,
|
||||
SUB => op1.wrapping_sub(op2),
|
||||
RSB => op2.wrapping_sub(op1),
|
||||
ADD => op1.wrapping_add(op2),
|
||||
ADC => op1.wrapping_add(op2).wrapping_add(carry),
|
||||
SBC => op1.wrapping_sub(op2.wrapping_add(1 - carry)),
|
||||
RSC => op2.wrapping_sub(op1.wrapping_add(1 - carry)),
|
||||
ADC => op1.wrapping_add(op2).wrapping_add(c),
|
||||
SBC => op1.wrapping_sub(op2.wrapping_add(1 - c)),
|
||||
RSC => op2.wrapping_sub(op1.wrapping_add(1 - c)),
|
||||
ORR => op1 | op2,
|
||||
MOV => op2,
|
||||
BIC => op1 & (!op2),
|
||||
|
@ -290,7 +291,9 @@ impl<I: MemoryInterface> Core<I> {
|
|||
if base_reg == REG_PC {
|
||||
addr = self.pc_arm() + 8; // prefetching
|
||||
}
|
||||
let offset = self.get_barrel_shifted_value(&insn.ldr_str_offset()); // TODO: wrong to use in here
|
||||
let mut carry = self.cpsr.C();
|
||||
let offset = self.get_barrel_shifted_value(&insn.ldr_str_offset(), &mut carry); // TODO: wrong to use in here
|
||||
self.cpsr.set_C(carry);
|
||||
let effective_addr = (addr as i32).wrapping_add(offset as i32) as Addr;
|
||||
|
||||
// TODO - confirm this
|
||||
|
@ -350,15 +353,16 @@ impl<I: MemoryInterface> Core<I> {
|
|||
}
|
||||
|
||||
pub fn exec_arm_ldr_str_hs_reg(&mut self, insn: u32) -> CpuAction {
|
||||
self.ldr_str_hs(
|
||||
insn,
|
||||
BarrelShifterValue::ShiftedRegister(ShiftedRegister {
|
||||
reg: (insn & 0xf) as usize,
|
||||
shift_by: ShiftRegisterBy::ByAmount(0),
|
||||
bs_op: BarrelShiftOpCode::LSL,
|
||||
added: Some(insn.add_offset_flag()),
|
||||
}),
|
||||
)
|
||||
let offset = {
|
||||
let added = insn.add_offset_flag();
|
||||
let abs = self.get_reg((insn & 0xf) as usize);
|
||||
if added {
|
||||
abs as u32
|
||||
} else {
|
||||
(-(abs as i32)) as u32
|
||||
}
|
||||
};
|
||||
self.ldr_str_hs_common(insn, offset)
|
||||
}
|
||||
|
||||
pub fn exec_arm_ldr_str_hs_imm(&mut self, insn: u32) -> CpuAction {
|
||||
|
@ -368,11 +372,11 @@ impl<I: MemoryInterface> Core<I> {
|
|||
} else {
|
||||
(-(offset8 as i32)) as u32
|
||||
};
|
||||
self.ldr_str_hs(insn, BarrelShifterValue::ImmediateValue(offset8))
|
||||
self.ldr_str_hs_common(insn, offset8)
|
||||
}
|
||||
|
||||
#[inline(always)]
|
||||
pub fn ldr_str_hs(&mut self, insn: u32, offset: BarrelShifterValue) -> CpuAction {
|
||||
pub fn ldr_str_hs_common(&mut self, insn: u32, offset: u32) -> CpuAction {
|
||||
let mut result = CpuAction::AdvancePC(NonSeq);
|
||||
|
||||
let load = insn.load_flag();
|
||||
|
@ -385,8 +389,6 @@ impl<I: MemoryInterface> Core<I> {
|
|||
addr = self.pc_arm() + 8; // prefetching
|
||||
}
|
||||
|
||||
let offset = self.get_barrel_shifted_value(&offset);
|
||||
|
||||
// TODO - confirm this
|
||||
let old_mode = self.cpsr.mode();
|
||||
if !pre_index && writeback {
|
||||
|
|
|
@ -83,8 +83,6 @@ pub struct SavedCpuState {
|
|||
pub(super) spsr: RegPSR,
|
||||
|
||||
pub(super) banks: BankedRegisters,
|
||||
|
||||
pub(super) bs_carry_out: bool,
|
||||
}
|
||||
|
||||
#[derive(Clone, Debug)]
|
||||
|
@ -125,9 +123,6 @@ pub struct Core<I: MemoryInterface> {
|
|||
pub cpsr: RegPSR,
|
||||
pub(super) spsr: RegPSR,
|
||||
|
||||
// Todo - do I still need this?
|
||||
pub(super) bs_carry_out: bool,
|
||||
|
||||
pub(super) banks: BankedRegisters,
|
||||
|
||||
#[cfg(feature = "debugger")]
|
||||
|
@ -146,7 +141,6 @@ impl<I: MemoryInterface> Core<I> {
|
|||
cpsr,
|
||||
spsr: Default::default(),
|
||||
banks: BankedRegisters::default(),
|
||||
bs_carry_out: false,
|
||||
|
||||
#[cfg(feature = "debugger")]
|
||||
dbg_info: DebuggerInfo::default(),
|
||||
|
@ -167,7 +161,6 @@ impl<I: MemoryInterface> Core<I> {
|
|||
banks: state.banks,
|
||||
spsr: state.spsr,
|
||||
|
||||
bs_carry_out: state.bs_carry_out,
|
||||
pipeline: state.pipeline,
|
||||
next_fetch_access: state.next_fetch_access,
|
||||
|
||||
|
@ -184,7 +177,6 @@ impl<I: MemoryInterface> Core<I> {
|
|||
gpr: self.gpr.clone(),
|
||||
spsr: self.spsr,
|
||||
banks: self.banks.clone(),
|
||||
bs_carry_out: self.bs_carry_out,
|
||||
pipeline: self.pipeline.clone(),
|
||||
next_fetch_access: self.next_fetch_access,
|
||||
}
|
||||
|
@ -196,7 +188,6 @@ impl<I: MemoryInterface> Core<I> {
|
|||
self.gpr = state.gpr;
|
||||
self.spsr = state.spsr;
|
||||
self.banks = state.banks;
|
||||
self.bs_carry_out = state.bs_carry_out;
|
||||
self.pipeline = state.pipeline;
|
||||
self.next_fetch_access = state.next_fetch_access;
|
||||
}
|
||||
|
|
|
@ -1,8 +1,8 @@
|
|||
use std::fmt;
|
||||
use std::marker::PhantomData;
|
||||
|
||||
use super::InstructionDecoder;
|
||||
use super::Addr;
|
||||
use super::InstructionDecoder;
|
||||
|
||||
pub struct Disassembler<'a, D>
|
||||
where
|
||||
|
|
|
@ -134,7 +134,10 @@ impl<I: MemoryInterface> Core<I> {
|
|||
if addr & 0x3 != 0 {
|
||||
let rotation = (addr & 0x3) << 3;
|
||||
let value = self.load_32(addr & !0x3, access);
|
||||
self.ror(value, rotation, self.cpsr.C(), false, false)
|
||||
let mut carry = self.cpsr.C();
|
||||
let v = self.ror(value, rotation, &mut carry, false, false);
|
||||
self.cpsr.set_C(carry);
|
||||
v
|
||||
} else {
|
||||
self.load_32(addr, access)
|
||||
}
|
||||
|
@ -146,7 +149,10 @@ impl<I: MemoryInterface> Core<I> {
|
|||
if addr & 0x1 != 0 {
|
||||
let rotation = (addr & 0x1) << 3;
|
||||
let value = self.load_16(addr & !0x1, access);
|
||||
self.ror(value as u32, rotation, self.cpsr.C(), false, false)
|
||||
let mut carry = self.cpsr.C();
|
||||
let v = self.ror(value as u32, rotation, &mut carry, false, false);
|
||||
self.cpsr.set_C(carry);
|
||||
v
|
||||
} else {
|
||||
self.load_16(addr, access) as u32
|
||||
}
|
||||
|
|
|
@ -15,15 +15,16 @@ impl<I: MemoryInterface> Core<I> {
|
|||
let rs = insn.bit_range(3..6) as usize;
|
||||
|
||||
let shift_amount = insn.offset5() as u8 as u32;
|
||||
let mut carry = self.cpsr.C();
|
||||
let op2 = self.barrel_shift_op(
|
||||
insn.format1_op(),
|
||||
self.gpr[rs],
|
||||
shift_amount,
|
||||
self.cpsr.C(),
|
||||
&mut carry,
|
||||
true,
|
||||
);
|
||||
self.gpr[rd] = op2;
|
||||
self.alu_update_flags(op2, false, self.bs_carry_out, self.cpsr.V());
|
||||
self.alu_update_flags(op2, false, carry, self.cpsr.V());
|
||||
|
||||
CpuAction::AdvancePC(Seq)
|
||||
}
|
||||
|
@ -95,9 +96,8 @@ impl<I: MemoryInterface> Core<I> {
|
|||
|
||||
macro_rules! shifter_op {
|
||||
($bs_op:expr) => {{
|
||||
let result = self.shift_by_register($bs_op, rd, rs, carry);
|
||||
let result = self.shift_by_register($bs_op, rd, rs, &mut carry);
|
||||
self.idle_cycle();
|
||||
carry = self.bs_carry_out;
|
||||
result
|
||||
}};
|
||||
}
|
||||
|
|
|
@ -336,7 +336,10 @@ impl EepromController {
|
|||
match (src, dst) {
|
||||
// DMA to EEPROM
|
||||
(_, 0x0d000000..=0x0dffffff) => {
|
||||
debug!("caught eeprom dma transfer src={:#x} dst={:#x} count={}", src, dst, count);
|
||||
debug!(
|
||||
"caught eeprom dma transfer src={:#x} dst={:#x} count={}",
|
||||
src, dst, count
|
||||
);
|
||||
let eeprom_type = match count {
|
||||
// Read(11) + 6bit address + stop bit
|
||||
9 => Eeprom512,
|
||||
|
@ -346,7 +349,10 @@ impl EepromController {
|
|||
73 => Eeprom512,
|
||||
// Write(11) + 14bit address + 64bit value + stop bit
|
||||
81 => Eeprom8k,
|
||||
_ => panic!("unexpected bit count ({}) when detecting eeprom size", count)
|
||||
_ => panic!(
|
||||
"unexpected bit count ({}) when detecting eeprom size",
|
||||
count
|
||||
),
|
||||
};
|
||||
info!("detected eeprom type: {:?}", eeprom_type);
|
||||
self.chip.borrow_mut().set_type(eeprom_type);
|
||||
|
@ -356,7 +362,7 @@ impl EepromController {
|
|||
(0x0d000000..=0x0dffffff, _) => {
|
||||
panic!("reading from eeprom when real size is not detected yet is not supported by this emulator")
|
||||
}
|
||||
_ => {/* Not a eeprom dma, doing nothing */}
|
||||
_ => { /* Not a eeprom dma, doing nothing */ }
|
||||
}
|
||||
} else {
|
||||
// this might be a eeprom request, so we need to reset the eeprom state machine if its dirty (due to bad behaving games, or tests roms)
|
||||
|
|
|
@ -291,10 +291,7 @@ impl SoundController {
|
|||
self.resampler.in_freq = self.sample_rate;
|
||||
}
|
||||
self.cycles_per_sample = 512 >> resolution;
|
||||
info!(
|
||||
"bias - setting sample frequency to {}hz",
|
||||
self.sample_rate
|
||||
);
|
||||
info!("bias - setting sample frequency to {}hz", self.sample_rate);
|
||||
// TODO this will not affect the currently scheduled sample event
|
||||
}
|
||||
|
||||
|
|
Reference in a new issue