From c6feb5d500119d2a5751d0092d3f8b31ca00f812 Mon Sep 17 00:00:00 2001 From: Michel Heily Date: Sat, 11 Jan 2020 15:58:32 +0200 Subject: [PATCH] core: Optimizing GameBoyAdvance::frame Profiling GameBoyAdvance::frame shows that it spends way too much time idling on SoundController::update & Gpu::step waiting for the cycle count to reach the next event, consuming CPU time for the Arm7tdmi core. This commit changes the implementation of the main loop functions so that the CPU will run as many cycles as possible and the peripherals will only be updated when needed. This is a performance improvement of roughly 50% in fps in some games! Former-commit-id: 937e097f958423934c70b7face6b6b02926b7a51 --- src/core/dma.rs | 27 ++++++-------- src/core/gba.rs | 73 +++++++++++++++++++++++++------------ src/core/gpu/mod.rs | 59 ++++++++++++++++++++---------- src/core/sound/mod.rs | 16 +++++--- src/core/timer.rs | 85 ++++++++++++++++++++++++------------------- src/lib.rs | 1 + 6 files changed, 159 insertions(+), 102 deletions(-) diff --git a/src/core/dma.rs b/src/core/dma.rs index 71a3f3a..f37b231 100644 --- a/src/core/dma.rs +++ b/src/core/dma.rs @@ -1,11 +1,8 @@ -extern crate bit_set; - -use bit_set::BitSet; - use super::iodev::consts::{REG_FIFO_A, REG_FIFO_B}; use super::sysbus::SysBus; use super::{Addr, Bus, Interrupt, IrqBitmask}; +use bit_set::BitSet; use num::FromPrimitive; #[derive(Debug)] @@ -178,7 +175,7 @@ impl DmaChannel { #[derive(Debug)] pub struct DmaController { pub channels: [DmaChannel; 4], - pending_bittset: BitSet, + pending_set: BitSet, cycles: usize, } @@ -191,20 +188,20 @@ impl DmaController { DmaChannel::new(2), DmaChannel::new(3), ], - pending_bittset: BitSet::with_capacity(4), + pending_set: BitSet::with_capacity(4), cycles: 0, } } - pub fn has_work(&self) -> bool { - !self.pending_bittset.is_empty() + pub fn is_active(&self) -> bool { + !self.pending_set.is_empty() } pub fn perform_work(&mut self, sb: &mut SysBus, irqs: &mut IrqBitmask) { - for id in 
self.pending_bittset.iter() { + for id in self.pending_set.iter() { self.channels[id].xfer(sb, irqs); } - self.pending_bittset.clear(); + self.pending_set.clear(); } pub fn write_16(&mut self, channel_id: usize, ofs: u32, value: u16) { @@ -216,9 +213,9 @@ impl DmaController { 8 => self.channels[channel_id].write_word_count(value), 10 => { if self.channels[channel_id].write_dma_ctrl(value) { - self.pending_bittset.insert(channel_id); + self.pending_set.insert(channel_id); } else { - self.pending_bittset.remove(channel_id); + self.pending_set.remove(channel_id); } } _ => panic!("Invalid dma offset {:x}", ofs), @@ -228,7 +225,7 @@ impl DmaController { pub fn notify_vblank(&mut self) { for i in 0..4 { if self.channels[i].ctrl.is_enabled() && self.channels[i].ctrl.timing() == 1 { - self.pending_bittset.insert(i); + self.pending_set.insert(i); } } } @@ -236,7 +233,7 @@ impl DmaController { pub fn notify_hblank(&mut self) { for i in 0..4 { if self.channels[i].ctrl.is_enabled() && self.channels[i].ctrl.timing() == 2 { - self.pending_bittset.insert(i); + self.pending_set.insert(i); } } } @@ -248,7 +245,7 @@ impl DmaController { && self.channels[i].ctrl.timing() == 3 && self.channels[i].dst == fifo_addr { - self.pending_bittset.insert(i); + self.pending_set.insert(i); } } } diff --git a/src/core/gba.rs b/src/core/gba.rs index 502b672..5da5a89 100644 --- a/src/core/gba.rs +++ b/src/core/gba.rs @@ -16,6 +16,8 @@ pub struct GameBoyAdvance { pub sysbus: Box, pub cpu: Core, input_device: Rc>, + + cycles_to_next_event: usize, } impl GameBoyAdvance { @@ -34,6 +36,8 @@ impl GameBoyAdvance { cpu: cpu, sysbus: Box::new(SysBus::new(io, bios_rom, gamepak)), input_device: input_device, + + cycles_to_next_event: 1, } } @@ -74,41 +78,64 @@ impl GameBoyAdvance { None } - pub fn step(&mut self) { - let mut irqs = IrqBitmask(0); + fn step_cpu(&mut self, io: &mut IoDevices) -> usize { + if io.intc.irq_pending() + && self.cpu.last_executed.is_some() + && !self.cpu.did_pipeline_flush() + { + 
self.cpu.irq(&mut self.sysbus); + io.haltcnt = HaltState::Running; + } let previous_cycles = self.cpu.cycles; + self.cpu.step(&mut self.sysbus); + self.cpu.cycles - previous_cycles + } + pub fn step(&mut self) { // // I hate myself for doing this, but rust left me no choice. let io = unsafe { let ptr = &mut *self.sysbus as *mut SysBus; &mut (*ptr).io as &mut IoDevices }; - let cycles = if !io.dmac.has_work() { - if io.intc.irq_pending() - && self.cpu.last_executed.is_some() - && !self.cpu.did_pipeline_flush() - { - self.cpu.irq(&mut self.sysbus); - io.haltcnt = HaltState::Running; - } + let mut irqs = IrqBitmask(0); - if HaltState::Running == io.haltcnt { - self.cpu.step(&mut self.sysbus).unwrap(); - self.cpu.cycles - previous_cycles + let mut cycles_left = self.cycles_to_next_event; + let mut cycles_to_next_event = std::usize::MAX; + let mut cycles = 0; + + while cycles_left > 0 { + let mut irqs = IrqBitmask(0); + let _cycles = if !io.dmac.is_active() { + if HaltState::Running == io.haltcnt { + self.step_cpu(io) + } else { + cycles = cycles_left; + break; + } } else { - 1 + io.dmac.perform_work(&mut self.sysbus, &mut irqs); + io.intc.request_irqs(irqs); + return; + }; + + cycles += _cycles; + if cycles_left < _cycles { + break; } - } else { - io.dmac.perform_work(&mut self.sysbus, &mut irqs); - 0 - }; - - io.timers.step(cycles, &mut self.sysbus, &mut irqs); - - io.gpu.step(cycles, &mut self.sysbus, &mut irqs); + cycles_left -= _cycles; + } + // update gpu & sound + io.timers.update(cycles, &mut self.sysbus, &mut irqs); + io.gpu.step( + cycles, + &mut self.sysbus, + &mut irqs, + &mut cycles_to_next_event, + ); + io.sound.update(cycles, &mut cycles_to_next_event); + self.cycles_to_next_event = cycles_to_next_event; io.intc.request_irqs(irqs); - io.sound.update(self.cpu.cycles); } } diff --git a/src/core/gpu/mod.rs b/src/core/gpu/mod.rs index fdd3b72..18e18d9 100644 --- a/src/core/gpu/mod.rs +++ b/src/core/gpu/mod.rs @@ -180,7 +180,9 @@ pub struct Gpu { #[debug_stub 
= "video handle"] video_device: VideoDeviceRcRefCell, pub state: GpuState, - cycles: usize, + + /// how many cycles left until next gpu state ? + cycles_left_for_current_state: usize, // registers pub vcount: usize, // VCOUNT @@ -231,7 +233,7 @@ impl Gpu { state: HDraw, vcount: 0, - cycles: 0, + cycles_left_for_current_state: CYCLES_HDRAW, palette_ram: BoxedMemory::new(vec![0; PALETTE_RAM_SIZE].into_boxed_slice()), vram: BoxedMemory::new(vec![0; VIDEO_RAM_SIZE].into_boxed_slice()), @@ -342,34 +344,39 @@ impl Gpu { } // Returns the new gpu state - pub fn step(&mut self, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) { - self.cycles += cycles; + pub fn step( + &mut self, + cycles: usize, + sb: &mut SysBus, + irqs: &mut IrqBitmask, + cycles_to_next_event: &mut usize, + ) { + if self.cycles_left_for_current_state <= cycles { + let overshoot = cycles - self.cycles_left_for_current_state; - match self.state { - HDraw => { - if self.cycles > CYCLES_HDRAW { - self.cycles -= CYCLES_HDRAW; - // HBlank + // handle the state change + match self.state { + HDraw => { + // Transition to HBlank + self.state = HBlank; + self.cycles_left_for_current_state = CYCLES_HBLANK; self.dispstat.set_hblank_flag(true); if self.dispstat.hblank_irq_enable() { irqs.set_LCD_HBlank(true); }; - self.state = HBlank; sb.io.dmac.notify_hblank(); } - } - HBlank => { - if self.cycles > CYCLES_HBLANK { - self.cycles -= CYCLES_HBLANK; - + HBlank => { self.dispstat.set_hblank_flag(false); self.update_vcount(self.vcount + 1, irqs); if self.vcount < DISPLAY_HEIGHT { self.render_scanline(); self.state = HDraw; + self.cycles_left_for_current_state = CYCLES_HDRAW; } else { self.state = VBlank; + self.cycles_left_for_current_state = CYCLES_SCANLINE; self.dispstat.set_vblank_flag(true); if self.dispstat.vblank_irq_enable() { irqs.set_LCD_VBlank(true); @@ -378,21 +385,33 @@ impl Gpu { self.video_device.borrow_mut().render(&self.frame_buffer); } } - } - VBlank => { - if self.cycles > CYCLES_SCANLINE { - 
self.cycles -= CYCLES_SCANLINE; - + VBlank => { if self.vcount < DISPLAY_HEIGHT + VBLANK_LINES - 1 { self.update_vcount(self.vcount + 1, irqs); + self.cycles_left_for_current_state = CYCLES_SCANLINE; } else { self.update_vcount(0, irqs); self.dispstat.set_vblank_flag(false); self.render_scanline(); self.state = HDraw; + + self.cycles_left_for_current_state = CYCLES_HDRAW; } } + }; + + // handle the overshoot + if overshoot < self.cycles_left_for_current_state { + self.cycles_left_for_current_state -= overshoot; + } else { + panic!("OH SHIT"); } + } else { + self.cycles_left_for_current_state -= cycles; + } + + if self.cycles_left_for_current_state < *cycles_to_next_event { + *cycles_to_next_event = self.cycles_left_for_current_state; } } } diff --git a/src/core/sound/mod.rs b/src/core/sound/mod.rs index 79f0049..d030401 100644 --- a/src/core/sound/mod.rs +++ b/src/core/sound/mod.rs @@ -68,7 +68,7 @@ pub struct SoundController { audio_device: AudioDeviceRcRefCell, sample_rate_to_cpu_freq: usize, // how many "cycles" are a sample? - last_sample_cycles: usize, // cycles count when we last provided a new sample. + cycles: usize, // cycles count when we last provided a new sample. mse: bool, @@ -114,7 +114,7 @@ impl SoundController { audio_device: audio_device, sample_rate_to_cpu_freq: 12345, - last_sample_cycles: 0, + cycles: 0, mse: false, left_volume: 0, left_sqr1: false, @@ -286,7 +286,7 @@ impl SoundController { self.resampler.in_freq = self.sample_rate; } self.cycles_per_sample = 512 >> resolution; - }, + } _ => { // println!( @@ -321,9 +321,10 @@ impl SoundController { } } - pub fn update(&mut self, cycles: usize) { - while cycles - self.last_sample_cycles >= self.cycles_per_sample { - self.last_sample_cycles += self.cycles_per_sample; + pub fn update(&mut self, cycles: usize, cycles_to_next_event: &mut usize) { + self.cycles += cycles; + while self.cycles >= self.cycles_per_sample { + self.cycles -= self.cycles_per_sample; // time to push a new sample! 
@@ -341,6 +342,9 @@ impl SoundController { self.resampler .push_sample((sample[0], sample[1]), &mut *audio); } + if self.cycles_per_sample < *cycles_to_next_event { + *cycles_to_next_event = self.cycles_per_sample; + } } } diff --git a/src/core/timer.rs b/src/core/timer.rs index a1d0b4c..ba7449a 100644 --- a/src/core/timer.rs +++ b/src/core/timer.rs @@ -2,6 +2,8 @@ use super::interrupt::{Interrupt, IrqBitmask}; use super::iodev::consts::*; use super::sysbus::SysBus; +use bit_set::BitSet; + use num::FromPrimitive; #[derive(Debug)] @@ -42,11 +44,32 @@ impl Timer { _ => unreachable!(), } } + + /// updates the timer with 'cycles' amount of cycles, returns the number of times it overflowed + fn update(&mut self, cycles: usize, irqs: &mut IrqBitmask) -> usize { + self.cycles += cycles; + let mut num_overflows = 0; + let freq = self.frequency(); + while self.cycles >= freq { + self.cycles -= freq; + self.data = self.data.wrapping_add(1); + if self.data == 0 { + if self.ctl.irq_enabled() { + irqs.add_irq(self.irq); + } + self.data = self.initial_data; + num_overflows += 1; + } + } + + num_overflows + } } #[derive(Debug)] pub struct Timers { timers: [Timer; 4], + running_timers: BitSet, pub trace: bool, } @@ -67,6 +90,7 @@ impl Timers { pub fn new() -> Timers { Timers { timers: [Timer::new(0), Timer::new(1), Timer::new(2), Timer::new(3)], + running_timers: BitSet::with_capacity(4), trace: false, } } @@ -75,6 +99,12 @@ impl Timers { let old_enabled = self[id].ctl.enabled(); self[id].ctl.0 = value; let new_enabled = self[id].ctl.enabled(); + let cascade = self.timers[id].ctl.cascade(); + if new_enabled && !cascade { + self.running_timers.insert(id); + } else { + self.running_timers.remove(id); + } if self.trace && old_enabled != new_enabled { println!( "TMR{} {}", @@ -127,45 +157,24 @@ impl Timers { } } - fn update_timer(&mut self, id: usize, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) { - let timer = &mut self.timers[id]; - timer.cycles += cycles; - let mut 
num_overflows = 0; - let freq = timer.frequency(); - while timer.cycles >= freq { - timer.cycles -= freq; - timer.data = timer.data.wrapping_add(1); - if timer.data == 0 { - if self.trace { - println!("TMR{} overflown!", id); + pub fn update(&mut self, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) { + for id in self.running_timers.iter() { + if !self.timers[id].ctl.cascade() { + let timer = &mut self.timers[id]; + let num_overflows = timer.update(cycles, irqs); + if num_overflows > 0 { + if id != 3 { + let next_timer = &mut self.timers[id + 1]; + if next_timer.ctl.cascade() { + next_timer.update(num_overflows, irqs); + } + } + if id == 0 || id == 1 { + sb.io + .sound + .handle_timer_overflow(&mut sb.io.dmac, id, num_overflows); + } } - if timer.ctl.irq_enabled() { - irqs.add_irq(timer.irq); - } - timer.data = timer.initial_data; - num_overflows += 1; - } - } - - if num_overflows > 0 { - if id != 3 { - let next_timer = &mut self.timers[id + 1]; - if next_timer.ctl.cascade() { - self.update_timer(id + 1, num_overflows, sb, irqs); - } - } - if id == 0 || id == 1 { - sb.io - .sound - .handle_timer_overflow(&mut sb.io.dmac, id, num_overflows); - } - } - } - - pub fn step(&mut self, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) { - for i in 0..4 { - if self.timers[i].ctl.enabled() && !self.timers[i].ctl.cascade() { - self.update_timer(i, cycles, sb, irqs); } } } diff --git a/src/lib.rs b/src/lib.rs index 33e4fed..8dbed54 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -15,6 +15,7 @@ extern crate bit; extern crate bitfield; #[macro_use] extern crate bitflags; +extern crate bit_set; extern crate byteorder;