core: Optimizing GameBoyAdvance::frame

Profiling GameBoyAdvance::frame shows that it spends way too much time idling
in SoundController::update & Gpu::step, waiting for the cycle count to
reach the next event and consuming CPU time that the Arm7tdmi core could use.

This commit changes the implementation of the main loop functions so
that the CPU will run as many cycles as possible and the peripherals
will only be updated when needed.
This is a performance improvement of roughly 50% in fps in some games!


Former-commit-id: 937e097f958423934c70b7face6b6b02926b7a51
This commit is contained in:
Michel Heily 2020-01-11 15:58:32 +02:00
parent 36cf4e62ce
commit c6feb5d500
6 changed files with 159 additions and 102 deletions

View file

@ -1,11 +1,8 @@
extern crate bit_set;
use bit_set::BitSet;
use super::iodev::consts::{REG_FIFO_A, REG_FIFO_B};
use super::sysbus::SysBus;
use super::{Addr, Bus, Interrupt, IrqBitmask};
use bit_set::BitSet;
use num::FromPrimitive;
#[derive(Debug)]
@ -178,7 +175,7 @@ impl DmaChannel {
#[derive(Debug)]
pub struct DmaController {
pub channels: [DmaChannel; 4],
pending_bittset: BitSet,
pending_set: BitSet,
cycles: usize,
}
@ -191,20 +188,20 @@ impl DmaController {
DmaChannel::new(2),
DmaChannel::new(3),
],
pending_bittset: BitSet::with_capacity(4),
pending_set: BitSet::with_capacity(4),
cycles: 0,
}
}
pub fn has_work(&self) -> bool {
!self.pending_bittset.is_empty()
pub fn is_active(&self) -> bool {
!self.pending_set.is_empty()
}
pub fn perform_work(&mut self, sb: &mut SysBus, irqs: &mut IrqBitmask) {
for id in self.pending_bittset.iter() {
for id in self.pending_set.iter() {
self.channels[id].xfer(sb, irqs);
}
self.pending_bittset.clear();
self.pending_set.clear();
}
pub fn write_16(&mut self, channel_id: usize, ofs: u32, value: u16) {
@ -216,9 +213,9 @@ impl DmaController {
8 => self.channels[channel_id].write_word_count(value),
10 => {
if self.channels[channel_id].write_dma_ctrl(value) {
self.pending_bittset.insert(channel_id);
self.pending_set.insert(channel_id);
} else {
self.pending_bittset.remove(channel_id);
self.pending_set.remove(channel_id);
}
}
_ => panic!("Invalid dma offset {:x}", ofs),
@ -228,7 +225,7 @@ impl DmaController {
pub fn notify_vblank(&mut self) {
for i in 0..4 {
if self.channels[i].ctrl.is_enabled() && self.channels[i].ctrl.timing() == 1 {
self.pending_bittset.insert(i);
self.pending_set.insert(i);
}
}
}
@ -236,7 +233,7 @@ impl DmaController {
pub fn notify_hblank(&mut self) {
for i in 0..4 {
if self.channels[i].ctrl.is_enabled() && self.channels[i].ctrl.timing() == 2 {
self.pending_bittset.insert(i);
self.pending_set.insert(i);
}
}
}
@ -248,7 +245,7 @@ impl DmaController {
&& self.channels[i].ctrl.timing() == 3
&& self.channels[i].dst == fifo_addr
{
self.pending_bittset.insert(i);
self.pending_set.insert(i);
}
}
}

View file

@ -16,6 +16,8 @@ pub struct GameBoyAdvance {
pub sysbus: Box<SysBus>,
pub cpu: Core,
input_device: Rc<RefCell<dyn InputInterface>>,
cycles_to_next_event: usize,
}
impl GameBoyAdvance {
@ -34,6 +36,8 @@ impl GameBoyAdvance {
cpu: cpu,
sysbus: Box::new(SysBus::new(io, bios_rom, gamepak)),
input_device: input_device,
cycles_to_next_event: 1,
}
}
@ -74,17 +78,7 @@ impl GameBoyAdvance {
None
}
pub fn step(&mut self) {
let mut irqs = IrqBitmask(0);
let previous_cycles = self.cpu.cycles;
// // I hate myself for doing this, but rust left me no choice.
let io = unsafe {
let ptr = &mut *self.sysbus as *mut SysBus;
&mut (*ptr).io as &mut IoDevices
};
let cycles = if !io.dmac.has_work() {
fn step_cpu(&mut self, io: &mut IoDevices) -> usize {
if io.intc.irq_pending()
&& self.cpu.last_executed.is_some()
&& !self.cpu.did_pipeline_flush()
@ -92,23 +86,56 @@ impl GameBoyAdvance {
self.cpu.irq(&mut self.sysbus);
io.haltcnt = HaltState::Running;
}
if HaltState::Running == io.haltcnt {
self.cpu.step(&mut self.sysbus).unwrap();
let previous_cycles = self.cpu.cycles;
self.cpu.step(&mut self.sysbus);
self.cpu.cycles - previous_cycles
}
pub fn step(&mut self) {
// // I hate myself for doing this, but rust left me no choice.
let io = unsafe {
let ptr = &mut *self.sysbus as *mut SysBus;
&mut (*ptr).io as &mut IoDevices
};
let mut irqs = IrqBitmask(0);
let mut cycles_left = self.cycles_to_next_event;
let mut cycles_to_next_event = std::usize::MAX;
let mut cycles = 0;
while cycles_left > 0 {
let mut irqs = IrqBitmask(0);
let _cycles = if !io.dmac.is_active() {
if HaltState::Running == io.haltcnt {
self.step_cpu(io)
} else {
1
cycles = cycles_left;
break;
}
} else {
io.dmac.perform_work(&mut self.sysbus, &mut irqs);
0
io.intc.request_irqs(irqs);
return;
};
io.timers.step(cycles, &mut self.sysbus, &mut irqs);
io.gpu.step(cycles, &mut self.sysbus, &mut irqs);
cycles += _cycles;
if cycles_left < _cycles {
break;
}
cycles_left -= _cycles;
}
// update gpu & sound
io.timers.update(cycles, &mut self.sysbus, &mut irqs);
io.gpu.step(
cycles,
&mut self.sysbus,
&mut irqs,
&mut cycles_to_next_event,
);
io.sound.update(cycles, &mut cycles_to_next_event);
self.cycles_to_next_event = cycles_to_next_event;
io.intc.request_irqs(irqs);
io.sound.update(self.cpu.cycles);
}
}

View file

@ -180,7 +180,9 @@ pub struct Gpu {
#[debug_stub = "video handle"]
video_device: VideoDeviceRcRefCell,
pub state: GpuState,
cycles: usize,
/// how many cycles left until next gpu state ?
cycles_left_for_current_state: usize,
// registers
pub vcount: usize, // VCOUNT
@ -231,7 +233,7 @@ impl Gpu {
state: HDraw,
vcount: 0,
cycles: 0,
cycles_left_for_current_state: CYCLES_HDRAW,
palette_ram: BoxedMemory::new(vec![0; PALETTE_RAM_SIZE].into_boxed_slice()),
vram: BoxedMemory::new(vec![0; VIDEO_RAM_SIZE].into_boxed_slice()),
@ -342,34 +344,39 @@ impl Gpu {
}
// Returns the new gpu state
pub fn step(&mut self, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) {
self.cycles += cycles;
pub fn step(
&mut self,
cycles: usize,
sb: &mut SysBus,
irqs: &mut IrqBitmask,
cycles_to_next_event: &mut usize,
) {
if self.cycles_left_for_current_state <= cycles {
let overshoot = cycles - self.cycles_left_for_current_state;
// handle the state change
match self.state {
HDraw => {
if self.cycles > CYCLES_HDRAW {
self.cycles -= CYCLES_HDRAW;
// HBlank
// Transition to HBlank
self.state = HBlank;
self.cycles_left_for_current_state = CYCLES_HBLANK;
self.dispstat.set_hblank_flag(true);
if self.dispstat.hblank_irq_enable() {
irqs.set_LCD_HBlank(true);
};
self.state = HBlank;
sb.io.dmac.notify_hblank();
}
}
HBlank => {
if self.cycles > CYCLES_HBLANK {
self.cycles -= CYCLES_HBLANK;
self.dispstat.set_hblank_flag(false);
self.update_vcount(self.vcount + 1, irqs);
if self.vcount < DISPLAY_HEIGHT {
self.render_scanline();
self.state = HDraw;
self.cycles_left_for_current_state = CYCLES_HDRAW;
} else {
self.state = VBlank;
self.cycles_left_for_current_state = CYCLES_SCANLINE;
self.dispstat.set_vblank_flag(true);
if self.dispstat.vblank_irq_enable() {
irqs.set_LCD_VBlank(true);
@ -378,21 +385,33 @@ impl Gpu {
self.video_device.borrow_mut().render(&self.frame_buffer);
}
}
}
VBlank => {
if self.cycles > CYCLES_SCANLINE {
self.cycles -= CYCLES_SCANLINE;
if self.vcount < DISPLAY_HEIGHT + VBLANK_LINES - 1 {
self.update_vcount(self.vcount + 1, irqs);
self.cycles_left_for_current_state = CYCLES_SCANLINE;
} else {
self.update_vcount(0, irqs);
self.dispstat.set_vblank_flag(false);
self.render_scanline();
self.state = HDraw;
self.cycles_left_for_current_state = CYCLES_HDRAW;
}
}
};
// handle the overshoot
if overshoot < self.cycles_left_for_current_state {
self.cycles_left_for_current_state -= overshoot;
} else {
panic!("OH SHIT");
}
} else {
self.cycles_left_for_current_state -= cycles;
}
if self.cycles_left_for_current_state < *cycles_to_next_event {
*cycles_to_next_event = self.cycles_left_for_current_state;
}
}
}

View file

@ -68,7 +68,7 @@ pub struct SoundController {
audio_device: AudioDeviceRcRefCell,
sample_rate_to_cpu_freq: usize, // how many "cycles" are a sample?
last_sample_cycles: usize, // cycles count when we last provided a new sample.
cycles: usize, // cycles count when we last provided a new sample.
mse: bool,
@ -114,7 +114,7 @@ impl SoundController {
audio_device: audio_device,
sample_rate_to_cpu_freq: 12345,
last_sample_cycles: 0,
cycles: 0,
mse: false,
left_volume: 0,
left_sqr1: false,
@ -286,7 +286,7 @@ impl SoundController {
self.resampler.in_freq = self.sample_rate;
}
self.cycles_per_sample = 512 >> resolution;
},
}
_ => {
// println!(
@ -321,9 +321,10 @@ impl SoundController {
}
}
pub fn update(&mut self, cycles: usize) {
while cycles - self.last_sample_cycles >= self.cycles_per_sample {
self.last_sample_cycles += self.cycles_per_sample;
pub fn update(&mut self, cycles: usize, cycles_to_next_event: &mut usize) {
self.cycles += cycles;
while self.cycles >= self.cycles_per_sample {
self.cycles -= self.cycles_per_sample;
// time to push a new sample!
@ -341,6 +342,9 @@ impl SoundController {
self.resampler
.push_sample((sample[0], sample[1]), &mut *audio);
}
if self.cycles_per_sample < *cycles_to_next_event {
*cycles_to_next_event = self.cycles_per_sample;
}
}
}

View file

@ -2,6 +2,8 @@ use super::interrupt::{Interrupt, IrqBitmask};
use super::iodev::consts::*;
use super::sysbus::SysBus;
use bit_set::BitSet;
use num::FromPrimitive;
#[derive(Debug)]
@ -42,11 +44,32 @@ impl Timer {
_ => unreachable!(),
}
}
/// Feeds `cycles` elapsed cycles into this timer and returns the number of times it overflowed.
///
/// The cycles are accumulated in `self.cycles`; each time the accumulator reaches
/// `self.frequency()`, that amount is subtracted and `data` is incremented by one.
/// When `data` wraps around to 0, the timer's IRQ is added to `irqs` (only if
/// `ctl.irq_enabled()`), `data` is reloaded from `initial_data`, and the overflow is counted.
fn update(&mut self, cycles: usize, irqs: &mut IrqBitmask) -> usize {
self.cycles += cycles;
let mut num_overflows = 0;
let freq = self.frequency();
// Consume the accumulated cycles one `freq`-sized tick at a time; any remainder
// stays in `self.cycles` for the next call.
while self.cycles >= freq {
self.cycles -= freq;
self.data = self.data.wrapping_add(1);
// `wrapping_add` landing on 0 means the counter just overflowed.
if self.data == 0 {
if self.ctl.irq_enabled() {
irqs.add_irq(self.irq);
}
// Reload the counter and record the overflow for the caller.
self.data = self.initial_data;
num_overflows += 1;
}
}
num_overflows
}
}
#[derive(Debug)]
pub struct Timers {
timers: [Timer; 4],
running_timers: BitSet,
pub trace: bool,
}
@ -67,6 +90,7 @@ impl Timers {
pub fn new() -> Timers {
Timers {
timers: [Timer::new(0), Timer::new(1), Timer::new(2), Timer::new(3)],
running_timers: BitSet::with_capacity(4),
trace: false,
}
}
@ -75,6 +99,12 @@ impl Timers {
let old_enabled = self[id].ctl.enabled();
self[id].ctl.0 = value;
let new_enabled = self[id].ctl.enabled();
let cascade = self.timers[id].ctl.cascade();
if new_enabled && !cascade {
self.running_timers.insert(id);
} else {
self.running_timers.remove(id);
}
if self.trace && old_enabled != new_enabled {
println!(
"TMR{} {}",
@ -127,31 +157,16 @@ impl Timers {
}
}
fn update_timer(&mut self, id: usize, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) {
pub fn update(&mut self, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) {
for id in self.running_timers.iter() {
if !self.timers[id].ctl.cascade() {
let timer = &mut self.timers[id];
timer.cycles += cycles;
let mut num_overflows = 0;
let freq = timer.frequency();
while timer.cycles >= freq {
timer.cycles -= freq;
timer.data = timer.data.wrapping_add(1);
if timer.data == 0 {
if self.trace {
println!("TMR{} overflown!", id);
}
if timer.ctl.irq_enabled() {
irqs.add_irq(timer.irq);
}
timer.data = timer.initial_data;
num_overflows += 1;
}
}
let num_overflows = timer.update(cycles, irqs);
if num_overflows > 0 {
if id != 3 {
let next_timer = &mut self.timers[id + 1];
if next_timer.ctl.cascade() {
self.update_timer(id + 1, num_overflows, sb, irqs);
next_timer.update(num_overflows, irqs);
}
}
if id == 0 || id == 1 {
@ -161,12 +176,6 @@ impl Timers {
}
}
}
pub fn step(&mut self, cycles: usize, sb: &mut SysBus, irqs: &mut IrqBitmask) {
for i in 0..4 {
if self.timers[i].ctl.enabled() && !self.timers[i].ctl.cascade() {
self.update_timer(i, cycles, sb, irqs);
}
}
}
}

View file

@ -15,6 +15,7 @@ extern crate bit;
extern crate bitfield;
#[macro_use]
extern crate bitflags;
extern crate bit_set;
extern crate byteorder;