core: Optimizing GameBoyAdvance::frame

Profiling GameBoyAdvance::frame shows that it spends way too much time idling
in SoundController::update & Gpu::step, waiting for the cycle count to
reach the next event and consuming CPU time that the Arm7tdmi core needs.

This commit changes the implementation of the main loop functions so
that the CPU will run as many cycles as possible and the peripherals
will only be updated when needed.
This is a performance improvement of roughly 50% in fps in some games!


Former-commit-id: 937e097f958423934c70b7face6b6b02926b7a51
This commit is contained in:
Michel Heily 2020-01-11 15:58:32 +02:00
parent 36cf4e62ce
commit c6feb5d500
6 changed files with 159 additions and 102 deletions

View file

@ -1,11 +1,8 @@
extern crate bit_set;
use bit_set::BitSet;
use super::iodev::consts::{REG_FIFO_A, REG_FIFO_B}; use super::iodev::consts::{REG_FIFO_A, REG_FIFO_B};
use super::sysbus::SysBus; use super::sysbus::SysBus;
use super::{Addr, Bus, Interrupt, IrqBitmask}; use super::{Addr, Bus, Interrupt, IrqBitmask};
use bit_set::BitSet;
use num::FromPrimitive; use num::FromPrimitive;
#[derive(Debug)] #[derive(Debug)]
@ -178,7 +175,7 @@ impl DmaChannel {
#[derive(Debug)] #[derive(Debug)]
pub struct DmaController { pub struct DmaController {
pub channels: [DmaChannel; 4], pub channels: [DmaChannel; 4],
pending_bittset: BitSet, pending_set: BitSet,
cycles: usize, cycles: usize,
} }
@ -191,20 +188,20 @@ impl DmaController {
DmaChannel::new(2), DmaChannel::new(2),
DmaChannel::new(3), DmaChannel::new(3),
], ],
pending_bittset: BitSet::with_capacity(4), pending_set: BitSet::with_capacity(4),
cycles: 0, cycles: 0,
} }
} }
pub fn has_work(&self) -> bool { pub fn is_active(&self) -> bool {
!self.pending_bittset.is_empty() !self.pending_set.is_empty()
} }
pub fn perform_work(&mut self, sb: &mut SysBus, irqs: &mut IrqBitmask) { pub fn perform_work(&mut self, sb: &mut SysBus, irqs: &mut IrqBitmask) {
for id in self.pending_bittset.iter() { for id in self.pending_set.iter() {
self.channels[id].xfer(sb, irqs); self.channels[id].xfer(sb, irqs);
} }
self.pending_bittset.clear(); self.pending_set.clear();
} }
pub fn write_16(&mut self, channel_id: usize, ofs: u32, value: u16) { pub fn write_16(&mut self, channel_id: usize, ofs: u32, value: u16) {
@ -216,9 +213,9 @@ impl DmaController {
8 => self.channels[channel_id].write_word_count(value), 8 => self.channels[channel_id].write_word_count(value),
10 => { 10 => {
if self.channels[channel_id].write_dma_ctrl(value) { if self.channels[channel_id].write_dma_ctrl(value) {
self.pending_bittset.insert(channel_id); self.pending_set.insert(channel_id);
} else { } else {
self.pending_bittset.remove(channel_id); self.pending_set.remove(channel_id);
} }
} }
_ => panic!("Invalid dma offset {:x}", ofs), _ => panic!("Invalid dma offset {:x}", ofs),
@ -228,7 +225,7 @@ impl DmaController {
pub fn notify_vblank(&mut self) { pub fn notify_vblank(&mut self) {
for i in 0..4 { for i in 0..4 {
if self.channels[i].ctrl.is_enabled() && self.channels[i].ctrl.timing() == 1 { if self.channels[i].ctrl.is_enabled() && self.channels[i].ctrl.timing() == 1 {
self.pending_bittset.insert(i); self.pending_set.insert(i);
} }
} }
} }
@ -236,7 +233,7 @@ impl DmaController {
pub fn notify_hblank(&mut self) { pub fn notify_hblank(&mut self) {
for i in 0..4 { for i in 0..4 {
if self.channels[i].ctrl.is_enabled() && self.channels[i].ctrl.timing() == 2 { if self.channels[i].ctrl.is_enabled() && self.channels[i].ctrl.timing() == 2 {
self.pending_bittset.insert(i); self.pending_set.insert(i);
} }
} }
} }
@ -248,7 +245,7 @@ impl DmaController {
&& self.channels[i].ctrl.timing() == 3 && self.channels[i].ctrl.timing() == 3
&& self.channels[i].dst == fifo_addr && self.channels[i].dst == fifo_addr
{ {
self.pending_bittset.insert(i); self.pending_set.insert(i);
} }
} }
} }

View file

@ -16,6 +16,8 @@ pub struct GameBoyAdvance {
pub sysbus: Box<SysBus>, pub sysbus: Box<SysBus>,
pub cpu: Core, pub cpu: Core,
input_device: Rc<RefCell<dyn InputInterface>>, input_device: Rc<RefCell<dyn InputInterface>>,
cycles_to_next_event: usize,
} }
impl GameBoyAdvance { impl GameBoyAdvance {
@ -34,6 +36,8 @@ impl GameBoyAdvance {
cpu: cpu, cpu: cpu,
sysbus: Box::new(SysBus::new(io, bios_rom, gamepak)), sysbus: Box::new(SysBus::new(io, bios_rom, gamepak)),
input_device: input_device, input_device: input_device,
cycles_to_next_event: 1,
} }
} }
@ -74,41 +78,64 @@ impl GameBoyAdvance {
None None
} }
pub fn step(&mut self) { fn step_cpu(&mut self, io: &mut IoDevices) -> usize {
let mut irqs = IrqBitmask(0); if io.intc.irq_pending()
&& self.cpu.last_executed.is_some()
&& !self.cpu.did_pipeline_flush()
{
self.cpu.irq(&mut self.sysbus);
io.haltcnt = HaltState::Running;
}
let previous_cycles = self.cpu.cycles; let previous_cycles = self.cpu.cycles;
self.cpu.step(&mut self.sysbus);
self.cpu.cycles - previous_cycles
}
pub fn step(&mut self) {
// // I hate myself for doing this, but rust left me no choice. // // I hate myself for doing this, but rust left me no choice.
let io = unsafe { let io = unsafe {
let ptr = &mut *self.sysbus as *mut SysBus; let ptr = &mut *self.sysbus as *mut SysBus;
&mut (*ptr).io as &mut IoDevices &mut (*ptr).io as &mut IoDevices
}; };
let cycles = if !io.dmac.has_work() { let mut irqs = IrqBitmask(0);
if io.intc.irq_pending()
&& self.cpu.last_executed.is_some()
&& !self.cpu.did_pipeline_flush()
{
self.cpu.irq(&mut self.sysbus);
io.haltcnt = HaltState::Running;
}
if HaltState::Running == io.haltcnt { let mut cycles_left = self.cycles_to_next_event;
self.cpu.step(&mut self.sysbus).unwrap(); let mut cycles_to_next_event = std::usize::MAX;
self.cpu.cycles - previous_cycles let mut cycles = 0;
while cycles_left > 0 {
let mut irqs = IrqBitmask(0);
let _cycles = if !io.dmac.is_active() {
if HaltState::Running == io.haltcnt {
self.step_cpu(io)
} else {
cycles = cycles_left;
break;
}
} else { } else {
1 io.dmac.perform_work(&mut self.sysbus, &mut irqs);
io.intc.request_irqs(irqs);
return;
};
cycles += _cycles;
if cycles_left < _cycles {
break;
} }
} else { cycles_left -= _cycles;
io.dmac.perform_work(&mut self.sysbus, &mut irqs); }
0
};
io.timers.step(cycles, &mut self.sysbus, &mut irqs);
io.gpu.step(cycles, &mut self.sysbus, &mut irqs);
// update gpu & sound
io.timers.update(cycles, &mut self.sysbus, &mut irqs);
io.gpu.step(
cycles,
&mut self.sysbus,
&mut irqs,
&mut cycles_to_next_event,
);
io.sound.update(cycles, &mut cycles_to_next_event);
self.cycles_to_next_event = cycles_to_next_event;
io.intc.request_irqs(irqs); io.intc.request_irqs(irqs);
io.sound.update(self.cpu.cycles);
} }
} }

View file

@ -180,7 +180,9 @@ pub struct Gpu {
#[debug_stub = "video handle"] #[debug_stub = "video handle"]
video_device: VideoDeviceRcRefCell, video_device: VideoDeviceRcRefCell,
pub state: GpuState, pub state: GpuState,
cycles: usize,
/// how many cycles left until next gpu state ?
cycles_left_for_current_state: usize,
// registers // registers
pub vcount: usize, // VCOUNT pub vcount: usize, // VCOUNT
@ -231,7 +233,7 @@ impl Gpu {
state: HDraw, state: HDraw,
vcount: 0, vcount: 0,
cycles: 0, cycles_left_for_current_state: CYCLES_HDRAW,
palette_ram: BoxedMemory::new(vec![0; PALETTE_RAM_SIZE].into_boxed_slice()), palette_ram: BoxedMemory::new(vec![0; PALETTE_RAM_SIZE].into_boxed_slice()),
vram: BoxedMemory::new(vec![0; VIDEO_RAM_SIZE].into_boxed_slice()), vram: BoxedMemory::new(vec![0; VIDEO_RAM_SIZE].into_boxed_slice()),
@ -342,34 +344,39 @@ impl Gpu {
} }
// Returns the new gpu state // Returns the new gpu state
pub fn step(&mut self, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) { pub fn step(
self.cycles += cycles; &mut self,
cycles: usize,
sb: &mut SysBus,
irqs: &mut IrqBitmask,
cycles_to_next_event: &mut usize,
) {
if self.cycles_left_for_current_state <= cycles {
let overshoot = cycles - self.cycles_left_for_current_state;
match self.state { // handle the state change
HDraw => { match self.state {
if self.cycles > CYCLES_HDRAW { HDraw => {
self.cycles -= CYCLES_HDRAW; // Transition to HBlank
// HBlank self.state = HBlank;
self.cycles_left_for_current_state = CYCLES_HBLANK;
self.dispstat.set_hblank_flag(true); self.dispstat.set_hblank_flag(true);
if self.dispstat.hblank_irq_enable() { if self.dispstat.hblank_irq_enable() {
irqs.set_LCD_HBlank(true); irqs.set_LCD_HBlank(true);
}; };
self.state = HBlank;
sb.io.dmac.notify_hblank(); sb.io.dmac.notify_hblank();
} }
} HBlank => {
HBlank => {
if self.cycles > CYCLES_HBLANK {
self.cycles -= CYCLES_HBLANK;
self.dispstat.set_hblank_flag(false); self.dispstat.set_hblank_flag(false);
self.update_vcount(self.vcount + 1, irqs); self.update_vcount(self.vcount + 1, irqs);
if self.vcount < DISPLAY_HEIGHT { if self.vcount < DISPLAY_HEIGHT {
self.render_scanline(); self.render_scanline();
self.state = HDraw; self.state = HDraw;
self.cycles_left_for_current_state = CYCLES_HDRAW;
} else { } else {
self.state = VBlank; self.state = VBlank;
self.cycles_left_for_current_state = CYCLES_SCANLINE;
self.dispstat.set_vblank_flag(true); self.dispstat.set_vblank_flag(true);
if self.dispstat.vblank_irq_enable() { if self.dispstat.vblank_irq_enable() {
irqs.set_LCD_VBlank(true); irqs.set_LCD_VBlank(true);
@ -378,21 +385,33 @@ impl Gpu {
self.video_device.borrow_mut().render(&self.frame_buffer); self.video_device.borrow_mut().render(&self.frame_buffer);
} }
} }
} VBlank => {
VBlank => {
if self.cycles > CYCLES_SCANLINE {
self.cycles -= CYCLES_SCANLINE;
if self.vcount < DISPLAY_HEIGHT + VBLANK_LINES - 1 { if self.vcount < DISPLAY_HEIGHT + VBLANK_LINES - 1 {
self.update_vcount(self.vcount + 1, irqs); self.update_vcount(self.vcount + 1, irqs);
self.cycles_left_for_current_state = CYCLES_SCANLINE;
} else { } else {
self.update_vcount(0, irqs); self.update_vcount(0, irqs);
self.dispstat.set_vblank_flag(false); self.dispstat.set_vblank_flag(false);
self.render_scanline(); self.render_scanline();
self.state = HDraw; self.state = HDraw;
self.cycles_left_for_current_state = CYCLES_HDRAW;
} }
} }
};
// handle the overshoot
if overshoot < self.cycles_left_for_current_state {
self.cycles_left_for_current_state -= overshoot;
} else {
panic!("OH SHIT");
} }
} else {
self.cycles_left_for_current_state -= cycles;
}
if self.cycles_left_for_current_state < *cycles_to_next_event {
*cycles_to_next_event = self.cycles_left_for_current_state;
} }
} }
} }

View file

@ -68,7 +68,7 @@ pub struct SoundController {
audio_device: AudioDeviceRcRefCell, audio_device: AudioDeviceRcRefCell,
sample_rate_to_cpu_freq: usize, // how many "cycles" are a sample? sample_rate_to_cpu_freq: usize, // how many "cycles" are a sample?
last_sample_cycles: usize, // cycles count when we last provided a new sample. cycles: usize, // cycles count when we last provided a new sample.
mse: bool, mse: bool,
@ -114,7 +114,7 @@ impl SoundController {
audio_device: audio_device, audio_device: audio_device,
sample_rate_to_cpu_freq: 12345, sample_rate_to_cpu_freq: 12345,
last_sample_cycles: 0, cycles: 0,
mse: false, mse: false,
left_volume: 0, left_volume: 0,
left_sqr1: false, left_sqr1: false,
@ -286,7 +286,7 @@ impl SoundController {
self.resampler.in_freq = self.sample_rate; self.resampler.in_freq = self.sample_rate;
} }
self.cycles_per_sample = 512 >> resolution; self.cycles_per_sample = 512 >> resolution;
}, }
_ => { _ => {
// println!( // println!(
@ -321,9 +321,10 @@ impl SoundController {
} }
} }
pub fn update(&mut self, cycles: usize) { pub fn update(&mut self, cycles: usize, cycles_to_next_event: &mut usize) {
while cycles - self.last_sample_cycles >= self.cycles_per_sample { self.cycles += cycles;
self.last_sample_cycles += self.cycles_per_sample; while self.cycles >= self.cycles_per_sample {
self.cycles -= self.cycles_per_sample;
// time to push a new sample! // time to push a new sample!
@ -341,6 +342,9 @@ impl SoundController {
self.resampler self.resampler
.push_sample((sample[0], sample[1]), &mut *audio); .push_sample((sample[0], sample[1]), &mut *audio);
} }
if self.cycles_per_sample < *cycles_to_next_event {
*cycles_to_next_event = self.cycles_per_sample;
}
} }
} }

View file

@ -2,6 +2,8 @@ use super::interrupt::{Interrupt, IrqBitmask};
use super::iodev::consts::*; use super::iodev::consts::*;
use super::sysbus::SysBus; use super::sysbus::SysBus;
use bit_set::BitSet;
use num::FromPrimitive; use num::FromPrimitive;
#[derive(Debug)] #[derive(Debug)]
@ -42,11 +44,32 @@ impl Timer {
_ => unreachable!(), _ => unreachable!(),
} }
} }
/// Advances the timer by `cycles` and returns the number of times it overflowed.
///
/// `cycles` is added to the timer's accumulated `self.cycles`. For every
/// `self.frequency()` accumulated cycles the counter `data` is incremented by
/// one (wrapping). Each time `data` wraps around to zero, the timer's IRQ is
/// added to `irqs` (only if `ctl.irq_enabled()`), `data` is reloaded from
/// `initial_data`, and the overflow is counted.
fn update(&mut self, cycles: usize, irqs: &mut IrqBitmask) -> usize {
self.cycles += cycles;
let mut num_overflows = 0;
let freq = self.frequency();
// NOTE(review): if `frequency()` can ever return 0 this loop never terminates — confirm it is always nonzero.
// Consume the accumulated cycles one counter tick (`freq` cycles) at a time;
// any remainder smaller than `freq` stays in `self.cycles` for the next call.
while self.cycles >= freq {
self.cycles -= freq;
self.data = self.data.wrapping_add(1);
// A wrap-around to zero means the counter overflowed.
if self.data == 0 {
if self.ctl.irq_enabled() {
irqs.add_irq(self.irq);
}
// Reload the counter with its configured start value.
self.data = self.initial_data;
num_overflows += 1;
}
}
num_overflows
}
} }
#[derive(Debug)] #[derive(Debug)]
pub struct Timers { pub struct Timers {
timers: [Timer; 4], timers: [Timer; 4],
running_timers: BitSet,
pub trace: bool, pub trace: bool,
} }
@ -67,6 +90,7 @@ impl Timers {
pub fn new() -> Timers { pub fn new() -> Timers {
Timers { Timers {
timers: [Timer::new(0), Timer::new(1), Timer::new(2), Timer::new(3)], timers: [Timer::new(0), Timer::new(1), Timer::new(2), Timer::new(3)],
running_timers: BitSet::with_capacity(4),
trace: false, trace: false,
} }
} }
@ -75,6 +99,12 @@ impl Timers {
let old_enabled = self[id].ctl.enabled(); let old_enabled = self[id].ctl.enabled();
self[id].ctl.0 = value; self[id].ctl.0 = value;
let new_enabled = self[id].ctl.enabled(); let new_enabled = self[id].ctl.enabled();
let cascade = self.timers[id].ctl.cascade();
if new_enabled && !cascade {
self.running_timers.insert(id);
} else {
self.running_timers.remove(id);
}
if self.trace && old_enabled != new_enabled { if self.trace && old_enabled != new_enabled {
println!( println!(
"TMR{} {}", "TMR{} {}",
@ -127,45 +157,24 @@ impl Timers {
} }
} }
fn update_timer(&mut self, id: usize, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) { pub fn update(&mut self, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) {
let timer = &mut self.timers[id]; for id in self.running_timers.iter() {
timer.cycles += cycles; if !self.timers[id].ctl.cascade() {
let mut num_overflows = 0; let timer = &mut self.timers[id];
let freq = timer.frequency(); let num_overflows = timer.update(cycles, irqs);
while timer.cycles >= freq { if num_overflows > 0 {
timer.cycles -= freq; if id != 3 {
timer.data = timer.data.wrapping_add(1); let next_timer = &mut self.timers[id + 1];
if timer.data == 0 { if next_timer.ctl.cascade() {
if self.trace { next_timer.update(num_overflows, irqs);
println!("TMR{} overflown!", id); }
}
if id == 0 || id == 1 {
sb.io
.sound
.handle_timer_overflow(&mut sb.io.dmac, id, num_overflows);
}
} }
if timer.ctl.irq_enabled() {
irqs.add_irq(timer.irq);
}
timer.data = timer.initial_data;
num_overflows += 1;
}
}
if num_overflows > 0 {
if id != 3 {
let next_timer = &mut self.timers[id + 1];
if next_timer.ctl.cascade() {
self.update_timer(id + 1, num_overflows, sb, irqs);
}
}
if id == 0 || id == 1 {
sb.io
.sound
.handle_timer_overflow(&mut sb.io.dmac, id, num_overflows);
}
}
}
pub fn step(&mut self, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) {
for i in 0..4 {
if self.timers[i].ctl.enabled() && !self.timers[i].ctl.cascade() {
self.update_timer(i, cycles, sb, irqs);
} }
} }
} }

View file

@ -15,6 +15,7 @@ extern crate bit;
extern crate bitfield; extern crate bitfield;
#[macro_use] #[macro_use]
extern crate bitflags; extern crate bitflags;
extern crate bit_set;
extern crate byteorder; extern crate byteorder;