fix/performance/accuracy: Reimplement sysbus stall cycle counting.

Previously I did not account for gamepak 32-bit accesses being two 16-bit (NSEQ+SEQ or SEQ+SEQ) accesses.
This resulted in fewer cycles being accounted for, which meant more wasted CPU cycles per frame and was a huge performance hit.

I re-implemented this with look-up tables, both to speed up the cycle
counting routine `SysBus::get_cycles` and to account for 32-bit gamepak memory accesses.


Former-commit-id: fe6a9a570c843d40e38971a2a36e6511df1b8894
This commit is contained in:
Michel Heily 2020-04-04 13:50:50 +03:00
parent c4c8163b0e
commit 8f11bebd2b
3 changed files with 212 additions and 98 deletions

View file

@ -46,9 +46,10 @@ impl GameBoyAdvance {
));
let io = IoDevices::new(gpu, sound_controller);
let sysbus = Box::new(SysBus::new(io, bios_rom, gamepak));
let cpu = arm7tdmi::Core::new();
GameBoyAdvance {
let mut gba = GameBoyAdvance {
cpu: cpu,
sysbus: sysbus,
@ -57,7 +58,11 @@ impl GameBoyAdvance {
input_device: input_device,
cycles_to_next_event: 1,
}
};
gba.sysbus.created();
gba
}
pub fn save_state(&self) -> bincode::Result<Vec<u8>> {
@ -76,6 +81,8 @@ impl GameBoyAdvance {
self.sysbus = decoded.sysbus;
self.cycles_to_next_event = 1;
self.sysbus.created();
Ok(())
}

View file

@ -4,6 +4,7 @@ use super::gpu::*;
use super::interrupt::InterruptController;
use super::keypad;
use super::sound::SoundController;
use super::sysbus::SysBusPtr;
use super::timer::Timers;
use super::{Addr, Bus};
@ -29,6 +30,12 @@ pub struct IoDevices {
pub post_boot_flag: bool,
pub waitcnt: WaitControl, // TODO also implement 4000800
pub haltcnt: HaltState,
// HACK
// my ownership design sucks
#[serde(skip)]
#[serde(default = "SysBusPtr::default")]
sysbus_ptr: SysBusPtr,
}
impl IoDevices {
@ -43,8 +50,14 @@ impl IoDevices {
haltcnt: HaltState::Running,
keyinput: keypad::KEYINPUT_ALL_RELEASED,
waitcnt: WaitControl(0),
sysbus_ptr: Default::default(),
}
}
/// Stores the back-pointer to the `SysBus` that should be notified of I/O writes.
///
/// The stored pointer is dereferenced when `REG_WAITCNT` is written (to call
/// `on_waitcnt_written`), so it has to be set before such a write happens.
pub fn set_sysbus_ptr(&mut self, ptr: SysBusPtr) {
self.sysbus_ptr = ptr;
}
}
impl Bus for IoDevices {
@ -231,7 +244,10 @@ impl Bus for IoDevices {
io.dmac.write_16(channel_id, ofs % 12, value)
}
REG_WAITCNT => io.waitcnt.0 = value,
REG_WAITCNT => {
io.waitcnt.0 = value;
(*io.sysbus_ptr).on_waitcnt_written(io.waitcnt);
}
REG_POSTFLG => io.post_boot_flag = value != 0,
REG_HALTCNT => {
@ -281,7 +297,7 @@ bitfield! {
pub struct WaitControl(u16);
impl Debug;
u16;
sram_wait_control, _: 1, 0;
pub sram_wait_control, _: 1, 0;
pub ws0_first_access, _: 3, 2;
pub ws0_second_access, _: 4, 4;
pub ws1_first_access, _: 6, 5;

View file

@ -1,11 +1,11 @@
use std::fmt;
use std::ops::Add;
use std::ops::{Deref, DerefMut};
use serde::{Deserialize, Serialize};
use super::cartridge::Cartridge;
use super::gpu::{GpuState, VIDEO_RAM_SIZE};
use super::iodev::IoDevices;
use super::gpu::VIDEO_RAM_SIZE;
use super::iodev::{IoDevices, WaitControl};
use super::{Addr, Bus};
pub mod consts {
@ -27,6 +27,19 @@ pub mod consts {
pub const GAMEPAK_WS2_HI: u32 = 0x0D00_0000;
pub const SRAM_LO: u32 = 0x0E00_0000;
pub const SRAM_HI: u32 = 0x0F00_0000;
pub const PAGE_BIOS: usize = (BIOS_ADDR >> 24) as usize;
pub const PAGE_EWRAM: usize = (EWRAM_ADDR >> 24) as usize;
pub const PAGE_IWRAM: usize = (IWRAM_ADDR >> 24) as usize;
pub const PAGE_IOMEM: usize = (IOMEM_ADDR >> 24) as usize;
pub const PAGE_PALRAM: usize = (PALRAM_ADDR >> 24) as usize;
pub const PAGE_VRAM: usize = (VRAM_ADDR >> 24) as usize;
pub const PAGE_OAM: usize = (OAM_ADDR >> 24) as usize;
pub const PAGE_GAMEPAK_WS0: usize = (GAMEPAK_WS0_LO >> 24) as usize;
pub const PAGE_GAMEPAK_WS1: usize = (GAMEPAK_WS1_LO >> 24) as usize;
pub const PAGE_GAMEPAK_WS2: usize = (GAMEPAK_WS2_LO >> 24) as usize;
pub const PAGE_SRAM_LO: usize = (SRAM_LO >> 24) as usize;
pub const PAGE_SRAM_HI: usize = (SRAM_HI >> 24) as usize;
}
use consts::*;
@ -57,23 +70,6 @@ pub enum MemoryAccessWidth {
MemoryAccess32,
}
impl Add<MemoryAccessWidth> for MemoryAccessType {
type Output = MemoryAccess;
fn add(self, other: MemoryAccessWidth) -> Self::Output {
MemoryAccess(self, other)
}
}
#[derive(Debug, Copy, Clone)]
pub struct MemoryAccess(pub MemoryAccessType, pub MemoryAccessWidth);
impl fmt::Display for MemoryAccess {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
write!(f, "{}-Cycle ({:?})", self.0, self.1)
}
}
#[derive(Serialize, Deserialize, Clone, Debug)]
#[repr(transparent)]
pub struct BoxedMemory {
@ -109,6 +105,100 @@ impl Bus for DummyBus {
fn write_8(&mut self, _addr: Addr, _value: u8) {}
}
// One table entry per memory page 0x0..=0xF, where page = address >> 24.
const CYCLE_LUT_SIZE: usize = 0x10;
/// Per-page access cycle counts consulted by `SysBus::get_cycles`.
///
/// Each table is indexed by the memory page (`addr >> 24`). The `n_*` tables
/// are used for non-sequential accesses and the `s_*` tables for sequential
/// ones; the `*16` tables serve 8- and 16-bit accesses, the `*32` tables serve
/// 32-bit accesses.
#[derive(Serialize, Deserialize, Clone)]
struct CycleLookupTables {
// non-sequential, 32-bit access
n_cycles32: [usize; CYCLE_LUT_SIZE],
// sequential, 32-bit access
s_cycles32: [usize; CYCLE_LUT_SIZE],
// non-sequential, 8/16-bit access
n_cycles16: [usize; CYCLE_LUT_SIZE],
// sequential, 8/16-bit access
s_cycles16: [usize; CYCLE_LUT_SIZE],
}
impl Default for CycleLookupTables {
    /// Builds tables in which every page costs a single cycle for every
    /// access kind; region-specific timings are filled in afterwards.
    fn default() -> Self {
        let single_cycle = [1usize; CYCLE_LUT_SIZE];
        Self {
            n_cycles32: single_cycle,
            s_cycles32: single_cycle,
            n_cycles16: single_cycle,
            s_cycles16: single_cycle,
        }
    }
}
impl CycleLookupTables {
pub fn init(&mut self) {
self.n_cycles32[PAGE_EWRAM] = 6;
self.s_cycles32[PAGE_EWRAM] = 6;
self.n_cycles16[PAGE_EWRAM] = 3;
self.s_cycles16[PAGE_EWRAM] = 3;
self.n_cycles32[PAGE_OAM] = 2;
self.s_cycles32[PAGE_OAM] = 2;
self.n_cycles16[PAGE_OAM] = 1;
self.s_cycles16[PAGE_OAM] = 1;
self.n_cycles32[PAGE_VRAM] = 2;
self.s_cycles32[PAGE_VRAM] = 2;
self.n_cycles16[PAGE_VRAM] = 1;
self.s_cycles16[PAGE_VRAM] = 1;
self.n_cycles32[PAGE_PALRAM] = 2;
self.s_cycles32[PAGE_PALRAM] = 2;
self.n_cycles16[PAGE_PALRAM] = 1;
self.s_cycles16[PAGE_PALRAM] = 1;
}
pub fn update_gamepak_waitstates(&mut self, waitcnt: WaitControl) {
static S_GAMEPAK_NSEQ_CYCLES: [usize; 4] = [4, 3, 2, 8];
static S_GAMEPAK_WS0_SEQ_CYCLES: [usize; 2] = [2, 1];
static S_GAMEPAK_WS1_SEQ_CYCLES: [usize; 2] = [4, 1];
static S_GAMEPAK_WS2_SEQ_CYCLES: [usize; 2] = [8, 1];
let ws0_first_access = waitcnt.ws0_first_access() as usize;
let ws1_first_access = waitcnt.ws1_first_access() as usize;
let ws2_first_access = waitcnt.ws2_first_access() as usize;
let ws0_second_access = waitcnt.ws0_second_access() as usize;
let ws1_second_access = waitcnt.ws1_second_access() as usize;
let ws2_second_access = waitcnt.ws2_second_access() as usize;
// update SRAM access
let sram_wait_cycles = 1 + S_GAMEPAK_NSEQ_CYCLES[waitcnt.sram_wait_control() as usize];
self.n_cycles32[PAGE_SRAM_LO] = sram_wait_cycles;
self.n_cycles32[PAGE_SRAM_LO] = sram_wait_cycles;
self.n_cycles16[PAGE_SRAM_HI] = sram_wait_cycles;
self.n_cycles16[PAGE_SRAM_HI] = sram_wait_cycles;
self.s_cycles32[PAGE_SRAM_LO] = sram_wait_cycles;
self.s_cycles32[PAGE_SRAM_LO] = sram_wait_cycles;
self.s_cycles16[PAGE_SRAM_HI] = sram_wait_cycles;
self.s_cycles16[PAGE_SRAM_HI] = sram_wait_cycles;
// update both pages of each waitstate
for i in 0..2 {
self.n_cycles16[PAGE_GAMEPAK_WS0 + i] = 1 + S_GAMEPAK_NSEQ_CYCLES[ws0_first_access];
self.s_cycles16[PAGE_GAMEPAK_WS0 + i] = 1 + S_GAMEPAK_WS0_SEQ_CYCLES[ws0_second_access];
self.n_cycles16[PAGE_GAMEPAK_WS1 + i] = 1 + S_GAMEPAK_NSEQ_CYCLES[ws1_first_access];
self.s_cycles16[PAGE_GAMEPAK_WS1 + i] = 1 + S_GAMEPAK_WS1_SEQ_CYCLES[ws1_second_access];
self.n_cycles16[PAGE_GAMEPAK_WS2 + i] = 1 + S_GAMEPAK_NSEQ_CYCLES[ws2_first_access];
self.s_cycles16[PAGE_GAMEPAK_WS2 + i] = 1 + S_GAMEPAK_WS2_SEQ_CYCLES[ws2_second_access];
// ROM 32bit accesses are split into two 16bit accesses 1N+1S
self.n_cycles32[PAGE_GAMEPAK_WS0 + i] =
self.n_cycles16[PAGE_GAMEPAK_WS0 + i] + self.s_cycles16[PAGE_GAMEPAK_WS0 + i];
self.n_cycles32[PAGE_GAMEPAK_WS1 + i] =
self.n_cycles16[PAGE_GAMEPAK_WS1 + i] + self.s_cycles16[PAGE_GAMEPAK_WS1 + i];
self.n_cycles32[PAGE_GAMEPAK_WS2 + i] =
self.n_cycles16[PAGE_GAMEPAK_WS2 + i] + self.s_cycles16[PAGE_GAMEPAK_WS2 + i];
self.s_cycles32[PAGE_GAMEPAK_WS0 + i] = 2 * self.s_cycles16[PAGE_GAMEPAK_WS0 + i];
self.s_cycles32[PAGE_GAMEPAK_WS1 + i] = 2 * self.s_cycles16[PAGE_GAMEPAK_WS1 + i];
self.s_cycles32[PAGE_GAMEPAK_WS2 + i] = 2 * self.s_cycles16[PAGE_GAMEPAK_WS2 + i];
}
}
}
#[derive(Serialize, Deserialize, Clone)]
pub struct SysBus {
pub io: IoDevices,
@ -119,11 +209,50 @@ pub struct SysBus {
pub cartridge: Cartridge,
dummy: DummyBus,
cycle_luts: CycleLookupTables,
pub trace_access: bool,
}
/// Thin wrapper around a raw `*mut SysBus`.
///
/// The pointer defaults to null and carries no lifetime or ownership
/// information; whoever dereferences it is responsible for making sure it
/// points to a live `SysBus`.
#[repr(transparent)]
#[derive(Clone)]
pub struct SysBusPtr {
ptr: *mut SysBus,
}
impl Default for SysBusPtr {
fn default() -> SysBusPtr {
SysBusPtr {
ptr: std::ptr::null_mut::<SysBus>(),
}
}
}
impl SysBusPtr {
pub fn new(ptr: *mut SysBus) -> SysBusPtr {
SysBusPtr { ptr: ptr }
}
}
impl Deref for SysBusPtr {
    type Target = SysBus;

    /// Returns a shared reference to the pointed-to `SysBus`.
    ///
    /// # Panics
    ///
    /// In builds with debug assertions enabled, panics if the pointer is
    /// still null (i.e. it was never set after `Default` construction).
    fn deref(&self) -> &Self::Target {
        debug_assert!(
            !self.ptr.is_null(),
            "SysBusPtr dereferenced before it was set"
        );
        // SAFETY: not enforced by the type. This relies on the pointer having
        // been set to a `SysBus` that is still alive and has not moved since,
        // and on no conflicting mutable reference being active.
        unsafe { &*self.ptr }
    }
}
impl DerefMut for SysBusPtr {
    /// Returns an exclusive reference to the pointed-to `SysBus`.
    ///
    /// # Panics
    ///
    /// In builds with debug assertions enabled, panics if the pointer is
    /// still null (i.e. it was never set after `Default` construction).
    fn deref_mut(&mut self) -> &mut Self::Target {
        debug_assert!(
            !self.ptr.is_null(),
            "SysBusPtr dereferenced before it was set"
        );
        // SAFETY: not enforced by the type. This relies on the pointer having
        // been set to a `SysBus` that is still alive and has not moved since,
        // and on no other reference to it being used at the same time.
        unsafe { &mut *self.ptr }
    }
}
impl SysBus {
pub fn new(io: IoDevices, bios_rom: Box<[u8]>, cartridge: Cartridge) -> SysBus {
let mut luts = CycleLookupTables::default();
luts.init();
luts.update_gamepak_waitstates(io.waitcnt);
SysBus {
io: io,
@ -133,10 +262,23 @@ impl SysBus {
cartridge: cartridge,
dummy: DummyBus([0; 4]),
cycle_luts: luts,
trace_access: false,
}
}
/// Registers this `SysBus` with its I/O devices.
///
/// Must be called whenever this object is instantiated: it hands `self.io`
/// a raw pointer to `self`, so the pointer is only meaningful for as long as
/// this `SysBus` stays at its current address.
pub fn created(&mut self) {
    // HACK: the I/O devices get a raw back-pointer to the bus that owns them.
    let self_ptr = self as *mut SysBus;
    self.io.set_sysbus_ptr(SysBusPtr::new(self_ptr));
}
/// Rebuilds the wait-state dependent cycle tables from a new `WAITCNT` value.
///
/// Called from the I/O register write handler (through `SysBusPtr`) after
/// `REG_WAITCNT` has been written.
pub fn on_waitcnt_written(&mut self, waitcnt: WaitControl) {
self.cycle_luts.update_gamepak_waitstates(waitcnt);
}
fn map(&self, addr: Addr) -> (&dyn Bus, Addr) {
match addr & 0xff000000 {
BIOS_ADDR => {
@ -209,82 +351,31 @@ impl SysBus {
}
#[inline(always)]
pub fn get_cycles(&self, addr: Addr, access: MemoryAccess) -> usize {
let nonseq_cycles = [4, 3, 2, 8];
let seq_cycles = [2, 1];
pub fn get_cycles(
&self,
addr: Addr,
access: MemoryAccessType,
width: MemoryAccessWidth,
) -> usize {
use MemoryAccessType::*;
use MemoryAccessWidth::*;
let page = (addr >> 24) as usize;
let mut cycles = 0;
// TODO handle EWRAM accesses
match addr & 0xff000000 {
EWRAM_ADDR => match access.1 {
MemoryAccessWidth::MemoryAccess32 => cycles += 6,
_ => cycles += 3,
},
OAM_ADDR | VRAM_ADDR | PALRAM_ADDR => {
match access.1 {
MemoryAccessWidth::MemoryAccess32 => cycles += 2,
_ => cycles += 1,
}
if self.io.gpu.state == GpuState::HDraw {
cycles += 1;
}
}
GAMEPAK_WS0_LO | GAMEPAK_WS0_HI => match access.0 {
MemoryAccessType::NonSeq => match access.1 {
MemoryAccessWidth::MemoryAccess32 => {
cycles += nonseq_cycles[self.io.waitcnt.ws0_first_access() as usize];
cycles += seq_cycles[self.io.waitcnt.ws0_second_access() as usize];
}
_ => {
cycles += nonseq_cycles[self.io.waitcnt.ws0_first_access() as usize];
}
},
MemoryAccessType::Seq => {
cycles += seq_cycles[self.io.waitcnt.ws0_second_access() as usize];
if access.1 == MemoryAccessWidth::MemoryAccess32 {
cycles += seq_cycles[self.io.waitcnt.ws0_second_access() as usize];
}
}
},
GAMEPAK_WS1_LO | GAMEPAK_WS1_HI => match access.0 {
MemoryAccessType::NonSeq => match access.1 {
MemoryAccessWidth::MemoryAccess32 => {
cycles += nonseq_cycles[self.io.waitcnt.ws1_first_access() as usize];
cycles += seq_cycles[self.io.waitcnt.ws1_second_access() as usize];
}
_ => {
cycles += nonseq_cycles[self.io.waitcnt.ws1_first_access() as usize];
}
},
MemoryAccessType::Seq => {
cycles += seq_cycles[self.io.waitcnt.ws1_second_access() as usize];
if access.1 == MemoryAccessWidth::MemoryAccess32 {
cycles += seq_cycles[self.io.waitcnt.ws1_second_access() as usize];
}
}
},
GAMEPAK_WS2_LO | GAMEPAK_WS2_HI => match access.0 {
MemoryAccessType::NonSeq => match access.1 {
MemoryAccessWidth::MemoryAccess32 => {
cycles += nonseq_cycles[self.io.waitcnt.ws2_first_access() as usize];
cycles += seq_cycles[self.io.waitcnt.ws2_second_access() as usize];
}
_ => {
cycles += nonseq_cycles[self.io.waitcnt.ws2_first_access() as usize];
}
},
MemoryAccessType::Seq => {
cycles += seq_cycles[self.io.waitcnt.ws2_second_access() as usize];
if access.1 == MemoryAccessWidth::MemoryAccess32 {
cycles += seq_cycles[self.io.waitcnt.ws2_second_access() as usize];
}
}
},
_ => {}
// TODO optimize out by making the LUTs have 0x100 entries for each possible page ?
if page > 0xF {
// open bus
return 1;
}
match width {
MemoryAccess8 | MemoryAccess16 => match access {
NonSeq => self.cycle_luts.n_cycles16[page],
Seq => self.cycle_luts.s_cycles16[page],
},
MemoryAccess32 => match access {
NonSeq => self.cycle_luts.n_cycles32[page],
Seq => self.cycle_luts.s_cycles32[page],
},
}
cycles
}
}