From 21708a3d5867f48a063f93fc841c035e6a35d978 Mon Sep 17 00:00:00 2001
From: Michel Heily
Date: Sat, 5 Jun 2021 17:25:01 +0300
Subject: [PATCH] [perf] sysbus: Improve add_cycles()

Fulfill a TODO from long ago. I used perf-record (--call-graph dwarf) and
detected that add_cycles() was hot enough, so I added 2 optimizations:
- Removed bounds checks from array accesses
- Increased the LUT size to include dummy entries for open-bus to eliminate
  the if check

run_60_frames           time:   [183.65 ms 183.69 ms 183.73 ms]
                        change: [-9.4414% -9.2849% -9.1315%] (p = 0.00 < 0.05)
                        Performance has improved.

Former-commit-id: 1cbb596b856e604ad6c48eb0d47771e7cee44d1e
Former-commit-id: 9f15e35237f343d0c816fd9d51d81081736d9e17
---
 core/src/sysbus.rs | 41 +++++++++++++++++++++--------------------
 1 file changed, 21 insertions(+), 20 deletions(-)

diff --git a/core/src/sysbus.rs b/core/src/sysbus.rs
index afa13f0..a78bd67 100644
--- a/core/src/sysbus.rs
+++ b/core/src/sysbus.rs
@@ -46,23 +46,24 @@ pub mod consts {
 
 use consts::*;
 
-const CYCLE_LUT_SIZE: usize = 0x10;
+// Only the first 15 entries are actually used, the rest are dummy entries for open-bus
+const CYCLE_LUT_SIZE: usize = 0x100;
 
 #[derive(Serialize, Deserialize, Clone)]
 struct CycleLookupTables {
-    n_cycles32: [usize; CYCLE_LUT_SIZE],
-    s_cycles32: [usize; CYCLE_LUT_SIZE],
-    n_cycles16: [usize; CYCLE_LUT_SIZE],
-    s_cycles16: [usize; CYCLE_LUT_SIZE],
+    n_cycles32: Box<[usize]>,
+    s_cycles32: Box<[usize]>,
+    n_cycles16: Box<[usize]>,
+    s_cycles16: Box<[usize]>,
 }
 
 impl Default for CycleLookupTables {
     fn default() -> CycleLookupTables {
         CycleLookupTables {
-            n_cycles32: [1; CYCLE_LUT_SIZE],
-            s_cycles32: [1; CYCLE_LUT_SIZE],
-            n_cycles16: [1; CYCLE_LUT_SIZE],
-            s_cycles16: [1; CYCLE_LUT_SIZE],
+            n_cycles32: vec![1; CYCLE_LUT_SIZE].into_boxed_slice(),
+            s_cycles32: vec![1; CYCLE_LUT_SIZE].into_boxed_slice(),
+            n_cycles16: vec![1; CYCLE_LUT_SIZE].into_boxed_slice(),
+            s_cycles16: vec![1; CYCLE_LUT_SIZE].into_boxed_slice(),
         }
     }
 }
@@ -242,26 +243,26 @@ impl SysBus {
     pub fn add_cycles(&mut self, addr: Addr, access: MemoryAccess, width: MemoryAccessWidth) {
         use MemoryAccess::*;
         use MemoryAccessWidth::*;
-        let page = (addr >> 24) as usize;
+        // Mask to 8 bits (not 4) so open-bus pages above 0xF hit the dummy LUT entries
+        let page = ((addr >> 24) & 0xFF) as usize;
 
-        // TODO optimize out by making the LUTs have 0x100 entries for each possible page ?
-        let cycles = if page > 0xF {
-            // open bus
-            1
-        } else {
+        // SAFETY: `page` is masked to 0..=0xFF and Default builds every table with
+        // CYCLE_LUT_SIZE (0x100) entries. NOTE(review): a deserialized table of a
+        // different length would break this invariant - confirm it cannot happen.
+        let cycles = unsafe {
             match width {
                 MemoryAccess8 | MemoryAccess16 => match access {
-                    NonSeq => self.cycle_luts.n_cycles16[page],
-                    Seq => self.cycle_luts.s_cycles16[page],
+                    NonSeq => self.cycle_luts.n_cycles16.get_unchecked(page),
+                    Seq => self.cycle_luts.s_cycles16.get_unchecked(page),
                 },
                 MemoryAccess32 => match access {
-                    NonSeq => self.cycle_luts.n_cycles32[page],
-                    Seq => self.cycle_luts.s_cycles32[page],
+                    NonSeq => self.cycle_luts.n_cycles32.get_unchecked(page),
+                    Seq => self.cycle_luts.s_cycles32.get_unchecked(page),
                 },
             }
         };
 
-        self.scheduler.update(cycles);
+        self.scheduler.update(*cycles);
     }
 
     /// Helper for "open-bus" accesses