From 20506091cc5bdfe8c54f954b32bdb631086d1764 Mon Sep 17 00:00:00 2001 From: Michel Heily Date: Sat, 5 Jun 2021 18:54:46 +0300 Subject: [PATCH] [perf] arm: Improve performance with barrel shifter. Instead of using a struct member to hold the last barrel shifter carry output, which is expansive as it needs to be saved/loaded from memory, I now pass the carry around as an INOUT parameter. Using perf anotate really shows high perctile of samples being spent on reading/writing `self.bs_carry_out` Since this is a rather "surgical" changeset, I have made sure to run it against eggvance test suite, mGBA test suite and some games as well. I actually saw better improvements than what the benchmark measured, but 7% is decent enough :) ``` run_60_frames time: [180.18 ms 180.45 ms 180.77 ms] change: [-7.2464% -6.9081% -6.6324%] (p = 0.00 < 0.05) Performance has improved. ``` Former-commit-id: 7cd7105a07aa0b78cab9dc8bbae3682b02b7ab7c Former-commit-id: c68514beb3fa6c34f5f65544acbead21e527dbb0 --- core/src/arm7tdmi/alu.rs | 69 ++++++++++++----------------- core/src/arm7tdmi/arm/exec.rs | 50 +++++++++++---------- core/src/arm7tdmi/cpu.rs | 9 ---- core/src/arm7tdmi/disass.rs | 2 +- core/src/arm7tdmi/memory.rs | 10 ++++- core/src/arm7tdmi/thumb/exec.rs | 8 ++-- core/src/cartridge/backup/eeprom.rs | 12 +++-- core/src/sound/mod.rs | 5 +-- 8 files changed, 78 insertions(+), 87 deletions(-) diff --git a/core/src/arm7tdmi/alu.rs b/core/src/arm7tdmi/alu.rs index 9ac03db..77fb66d 100644 --- a/core/src/arm7tdmi/alu.rs +++ b/core/src/arm7tdmi/alu.rs @@ -111,66 +111,59 @@ impl BarrelShifterValue { } impl Core { - pub fn lsl(&mut self, val: u32, amount: u32, carry_in: bool) -> u32 { + pub fn lsl(&mut self, val: u32, amount: u32, carry: &mut bool) -> u32 { match amount { - 0 => { - self.bs_carry_out = carry_in; - val - } + 0 => val, x if x < 32 => { - self.bs_carry_out = val.wrapping_shr(32 - x) & 1 == 1; + *carry = val.wrapping_shr(32 - x) & 1 == 1; val << x } 32 => { - self.bs_carry_out = val & 1 == 1; + *carry = val & 1 == 1; 0 } _ => { - self.bs_carry_out = false; + *carry = false; 0 } } } - pub fn lsr(&mut self, val: u32, amount: u32, carry_in: bool, immediate: bool) -> u32 { + pub fn lsr(&mut self, val: u32, amount: u32, carry: &mut bool, immediate: bool) -> u32 { if amount != 0 { match amount { x if x < 32 => { - self.bs_carry_out = (val >> (amount - 1) & 1) == 1; + *carry = (val >> (amount - 1) & 1) == 1; val >> amount } 32 => { - self.bs_carry_out = val.bit(31); + *carry = val.bit(31); 0 } _ => { - self.bs_carry_out = false; + *carry = false; 0 } } } else if immediate { - self.bs_carry_out = val.bit(31); + *carry = val.bit(31); 0 } else { - self.bs_carry_out = carry_in; val } } - pub fn asr(&mut self, val: u32, amount: u32, carry_in: bool, immediate: bool) -> u32 { + pub fn asr(&mut self, val: u32, amount: u32, carry: &mut bool, immediate: bool) -> u32 { let amount = if immediate && amount == 0 { 32 } else { amount }; match amount { - 0 => { - self.bs_carry_out = carry_in; - val - } + 0 => val, x if x < 32 => { - self.bs_carry_out = val.wrapping_shr(amount - 1) & 1 == 1; + *carry = val.wrapping_shr(amount - 1) & 1 == 1; (val as i32).wrapping_shr(amount) as u32 } _ => { let bit31 = val.bit(31); - self.bs_carry_out = bit31; + *carry = bit31; if bit31 { 0xffffffff } else { @@ -180,9 +173,9 @@ impl Core { } } - pub fn rrx(&mut self, val: u32, carry_in: bool) -> u32 { - let old_c = carry_in as i32; - self.bs_carry_out = val & 0b1 != 0; + pub fn rrx(&mut self, val: u32, carry: &mut bool) -> u32 { + let old_c = *carry as i32; + *carry = val & 0b1 != 0; (((val as u32) >> 1) as i32 | (old_c << 31)) as u32 } @@ -190,14 +183,14 @@ impl Core { &mut self, val: u32, amount: u32, - carry_in: bool, + carry: &mut bool, immediate: bool, rrx: bool, ) -> u32 { match amount { 0 => { if immediate & rrx { - self.rrx(val, carry_in) + self.rrx(val, carry) } else { val } @@ -209,7 +202,7 @@ impl Core { } else { val }; - self.bs_carry_out = (val as u32).bit(31); + *carry = (val as u32).bit(31); val } } @@ -222,12 +215,9 @@ impl Core { shift: BarrelShiftOpCode, val: u32, amount: u32, - carry_in: bool, + carry: &mut bool, immediate: bool, ) -> u32 { - // TODO get rid of Core::bs_carry_out field in favour sending the carry as a &mut reference, - // Forcing calling functions to do something with the carry : - self.bs_carry_out = carry_in; // // From GBATEK: // Zero Shift Amount (Shift Register by Immediate, with Immediate=0) @@ -248,10 +238,10 @@ impl Core { // in the range 1 to 32 and see above. // match shift { - BarrelShiftOpCode::LSL => self.lsl(val, amount, carry_in), - BarrelShiftOpCode::LSR => self.lsr(val, amount, carry_in, immediate), - BarrelShiftOpCode::ASR => self.asr(val, amount, carry_in, immediate), - BarrelShiftOpCode::ROR => self.ror(val, amount, carry_in, immediate, true), + BarrelShiftOpCode::LSL => self.lsl(val, amount, carry), + BarrelShiftOpCode::LSR => self.lsr(val, amount, carry, immediate), + BarrelShiftOpCode::ASR => self.asr(val, amount, carry, immediate), + BarrelShiftOpCode::ROR => self.ror(val, amount, carry, immediate, true), } } @@ -261,7 +251,7 @@ impl Core { bs_op: BarrelShiftOpCode, reg: usize, rs: usize, - carry: bool, + carry: &mut bool, ) -> u32 { let mut val = self.get_reg(reg); if reg == REG_PC { @@ -271,8 +261,7 @@ impl Core { self.barrel_shift_op(bs_op, val, amount, carry, false) } - pub fn register_shift(&mut self, shift: &ShiftedRegister) -> u32 { - let carry = self.cpsr.C(); + pub fn register_shift(&mut self, shift: &ShiftedRegister, carry: &mut bool) -> u32 { match shift.shift_by { ShiftRegisterBy::ByAmount(amount) => { let result = @@ -285,13 +274,13 @@ impl Core { } } - pub fn get_barrel_shifted_value(&mut self, sval: &BarrelShifterValue) -> u32 { + pub fn get_barrel_shifted_value(&mut self, sval: &BarrelShifterValue, carry: &mut bool) -> u32 { // TODO decide if error handling or panic here match sval { BarrelShifterValue::ImmediateValue(offset) => *offset as u32, BarrelShifterValue::ShiftedRegister(shifted_reg) => { let added = (*shifted_reg).added.unwrap_or(true); - let abs = self.register_shift(shifted_reg) as u32; + let abs = self.register_shift(shifted_reg, carry) as u32; if added { abs as u32 } else { diff --git a/core/src/arm7tdmi/arm/exec.rs b/core/src/arm7tdmi/arm/exec.rs index fccad62..b893852 100644 --- a/core/src/arm7tdmi/arm/exec.rs +++ b/core/src/arm7tdmi/arm/exec.rs @@ -93,7 +93,10 @@ impl Core { if insn.bit(25) { let immediate = insn & 0xff; let rotate = 2 * insn.bit_range(8..12); - self.ror(immediate, rotate, self.cpsr.C(), false, true) + let mut carry = self.cpsr.C(); + let v = self.ror(immediate, rotate, &mut carry, false, true); + self.cpsr.set_C(carry); + v } else { self.get_reg((insn & 0b1111) as usize) } @@ -175,13 +178,12 @@ impl Core { let mut s_flag = insn.set_cond_flag(); let opcode = insn.opcode(); + let mut carry = self.cpsr.C(); let op2 = if insn.bit(25) { let immediate = insn & 0xff; let rotate = 2 * insn.bit_range(8..12); // TODO refactor out - let bs_carry_in = self.cpsr.C(); - self.bs_carry_out = bs_carry_in; - self.ror(immediate, rotate, self.cpsr.C(), false, true) + self.ror(immediate, rotate, &mut carry, false, true) } else { let reg = insn & 0xf; @@ -203,7 +205,7 @@ impl Core { shift_by: shift_by, added: None, }; - self.register_shift(&shifted_reg) + self.register_shift(&shifted_reg, &mut carry) }; if rd == REG_PC && s_flag { @@ -211,9 +213,7 @@ impl Core { s_flag = false; } - let carry = self.cpsr.C() as u32; let alu_res = if s_flag { - let mut carry = self.bs_carry_out; let mut overflow = self.cpsr.V(); let result = match opcode { AND | TST => op1 & op2, @@ -238,15 +238,16 @@ impl Core { Some(result) } } else { + let c = carry as u32; Some(match opcode { AND => op1 & op2, EOR => op1 ^ op2, SUB => op1.wrapping_sub(op2), RSB => op2.wrapping_sub(op1), ADD => op1.wrapping_add(op2), - ADC => op1.wrapping_add(op2).wrapping_add(carry), - SBC => op1.wrapping_sub(op2.wrapping_add(1 - carry)), - RSC => op2.wrapping_sub(op1.wrapping_add(1 - carry)), + ADC => op1.wrapping_add(op2).wrapping_add(c), + SBC => op1.wrapping_sub(op2.wrapping_add(1 - c)), + RSC => op2.wrapping_sub(op1.wrapping_add(1 - c)), ORR => op1 | op2, MOV => op2, BIC => op1 & (!op2), @@ -290,7 +291,9 @@ impl Core { if base_reg == REG_PC { addr = self.pc_arm() + 8; // prefetching } - let offset = self.get_barrel_shifted_value(&insn.ldr_str_offset()); // TODO: wrong to use in here + let mut carry = self.cpsr.C(); + let offset = self.get_barrel_shifted_value(&insn.ldr_str_offset(), &mut carry); // TODO: wrong to use in here + self.cpsr.set_C(carry); let effective_addr = (addr as i32).wrapping_add(offset as i32) as Addr; // TODO - confirm this @@ -350,15 +353,16 @@ impl Core { } pub fn exec_arm_ldr_str_hs_reg(&mut self, insn: u32) -> CpuAction { - self.ldr_str_hs( - insn, - BarrelShifterValue::ShiftedRegister(ShiftedRegister { - reg: (insn & 0xf) as usize, - shift_by: ShiftRegisterBy::ByAmount(0), - bs_op: BarrelShiftOpCode::LSL, - added: Some(insn.add_offset_flag()), - }), - ) + let offset = { + let added = insn.add_offset_flag(); + let abs = self.get_reg((insn & 0xf) as usize); + if added { + abs as u32 + } else { + (-(abs as i32)) as u32 + } + }; + self.ldr_str_hs_common(insn, offset) } pub fn exec_arm_ldr_str_hs_imm(&mut self, insn: u32) -> CpuAction { @@ -368,11 +372,11 @@ impl Core { } else { (-(offset8 as i32)) as u32 }; - self.ldr_str_hs(insn, BarrelShifterValue::ImmediateValue(offset8)) + self.ldr_str_hs_common(insn, offset8) } #[inline(always)] - pub fn ldr_str_hs(&mut self, insn: u32, offset: BarrelShifterValue) -> CpuAction { + pub fn ldr_str_hs_common(&mut self, insn: u32, offset: u32) -> CpuAction { let mut result = CpuAction::AdvancePC(NonSeq); let load = insn.load_flag(); @@ -385,8 +389,6 @@ impl Core { addr = self.pc_arm() + 8; // prefetching } - let offset = self.get_barrel_shifted_value(&offset); - // TODO - confirm this let old_mode = self.cpsr.mode(); if !pre_index && writeback { diff --git a/core/src/arm7tdmi/cpu.rs b/core/src/arm7tdmi/cpu.rs index afe8f3e..826484f 100644 --- a/core/src/arm7tdmi/cpu.rs +++ b/core/src/arm7tdmi/cpu.rs @@ -83,8 +83,6 @@ pub struct SavedCpuState { pub(super) spsr: RegPSR, pub(super) banks: BankedRegisters, - - pub(super) bs_carry_out: bool, } #[derive(Clone, Debug)] @@ -125,9 +123,6 @@ pub struct Core { pub cpsr: RegPSR, pub(super) spsr: RegPSR, - // Todo - do I still need this? - pub(super) bs_carry_out: bool, - pub(super) banks: BankedRegisters, #[cfg(feature = "debugger")] @@ -146,7 +141,6 @@ impl Core { cpsr, spsr: Default::default(), banks: BankedRegisters::default(), - bs_carry_out: false, #[cfg(feature = "debugger")] dbg_info: DebuggerInfo::default(), @@ -167,7 +161,6 @@ impl Core { banks: state.banks, spsr: state.spsr, - bs_carry_out: state.bs_carry_out, pipeline: state.pipeline, next_fetch_access: state.next_fetch_access, @@ -184,7 +177,6 @@ impl Core { gpr: self.gpr.clone(), spsr: self.spsr, banks: self.banks.clone(), - bs_carry_out: self.bs_carry_out, pipeline: self.pipeline.clone(), next_fetch_access: self.next_fetch_access, } @@ -196,7 +188,6 @@ impl Core { self.gpr = state.gpr; self.spsr = state.spsr; self.banks = state.banks; - self.bs_carry_out = state.bs_carry_out; self.pipeline = state.pipeline; self.next_fetch_access = state.next_fetch_access; } diff --git a/core/src/arm7tdmi/disass.rs b/core/src/arm7tdmi/disass.rs index 1db4d5f..bae98e4 100644 --- a/core/src/arm7tdmi/disass.rs +++ b/core/src/arm7tdmi/disass.rs @@ -1,8 +1,8 @@ use std::fmt; use std::marker::PhantomData; -use super::InstructionDecoder; use super::Addr; +use super::InstructionDecoder; pub struct Disassembler<'a, D> where diff --git a/core/src/arm7tdmi/memory.rs b/core/src/arm7tdmi/memory.rs index 3ac86f3..cd14079 100644 --- a/core/src/arm7tdmi/memory.rs +++ b/core/src/arm7tdmi/memory.rs @@ -134,7 +134,10 @@ impl Core { if addr & 0x3 != 0 { let rotation = (addr & 0x3) << 3; let value = self.load_32(addr & !0x3, access); - self.ror(value, rotation, self.cpsr.C(), false, false) + let mut carry = self.cpsr.C(); + let v = self.ror(value, rotation, &mut carry, false, false); + self.cpsr.set_C(carry); + v } else { self.load_32(addr, access) } @@ -146,7 +149,10 @@ impl Core { if addr & 0x1 != 0 { let rotation = (addr & 0x1) << 3; let value = self.load_16(addr & !0x1, access); - self.ror(value as u32, rotation, self.cpsr.C(), false, false) + let mut carry = self.cpsr.C(); + let v = self.ror(value as u32, rotation, &mut carry, false, false); + self.cpsr.set_C(carry); + v } else { self.load_16(addr, access) as u32 } diff --git a/core/src/arm7tdmi/thumb/exec.rs b/core/src/arm7tdmi/thumb/exec.rs index 4ff35e8..38f5b2a 100644 --- a/core/src/arm7tdmi/thumb/exec.rs +++ b/core/src/arm7tdmi/thumb/exec.rs @@ -15,15 +15,16 @@ impl Core { let rs = insn.bit_range(3..6) as usize; let shift_amount = insn.offset5() as u8 as u32; + let mut carry = self.cpsr.C(); let op2 = self.barrel_shift_op( insn.format1_op(), self.gpr[rs], shift_amount, - self.cpsr.C(), + &mut carry, true, ); self.gpr[rd] = op2; - self.alu_update_flags(op2, false, self.bs_carry_out, self.cpsr.V()); + self.alu_update_flags(op2, false, carry, self.cpsr.V()); CpuAction::AdvancePC(Seq) } @@ -95,9 +96,8 @@ impl Core { macro_rules! shifter_op { ($bs_op:expr) => {{ - let result = self.shift_by_register($bs_op, rd, rs, carry); + let result = self.shift_by_register($bs_op, rd, rs, &mut carry); self.idle_cycle(); - carry = self.bs_carry_out; result }}; } diff --git a/core/src/cartridge/backup/eeprom.rs b/core/src/cartridge/backup/eeprom.rs index cea1bef..ff74075 100644 --- a/core/src/cartridge/backup/eeprom.rs +++ b/core/src/cartridge/backup/eeprom.rs @@ -336,7 +336,10 @@ impl EepromController { match (src, dst) { // DMA to EEPROM (_, 0x0d000000..=0x0dffffff) => { - debug!("caught eeprom dma transfer src={:#x} dst={:#x} count={}", src, dst, count); + debug!( + "caught eeprom dma transfer src={:#x} dst={:#x} count={}", + src, dst, count + ); let eeprom_type = match count { // Read(11) + 6bit address + stop bit 9 => Eeprom512, @@ -346,7 +349,10 @@ impl EepromController { 73 => Eeprom512, // Write(11) + 14bit address + 64bit value + stop bit 81 => Eeprom8k, - _ => panic!("unexpected bit count ({}) when detecting eeprom size", count) + _ => panic!( + "unexpected bit count ({}) when detecting eeprom size", + count + ), }; info!("detected eeprom type: {:?}", eeprom_type); self.chip.borrow_mut().set_type(eeprom_type); @@ -356,7 +362,7 @@ impl EepromController { (0x0d000000..=0x0dffffff, _) => { panic!("reading from eeprom when real size is not detected yet is not supported by this emulator") } - _ => {/* Not a eeprom dma, doing nothing */} + _ => { /* Not a eeprom dma, doing nothing */ } } } else { // this might be a eeprom request, so we need to reset the eeprom state machine if its dirty (due to bad behaving games, or tests roms) diff --git a/core/src/sound/mod.rs b/core/src/sound/mod.rs index 9e23e54..576e0bb 100644 --- a/core/src/sound/mod.rs +++ b/core/src/sound/mod.rs @@ -291,10 +291,7 @@ impl SoundController { self.resampler.in_freq = self.sample_rate; } self.cycles_per_sample = 512 >> resolution; - info!( - "bias - setting sample frequency to {}hz", - self.sample_rate - ); + info!("bias - setting sample frequency to {}hz", self.sample_rate); // TODO this will not affect the currently scheduled sample event }