diff --git a/src/device/esp/esp32s3.S b/src/device/esp/esp32s3.S index 6566e3f342..eada22beb8 100644 --- a/src/device/esp/esp32s3.S +++ b/src/device/esp/esp32s3.S @@ -343,6 +343,66 @@ call_start_cpu0: // If main returns, loop forever. 1: j 1b +// ----------------------------------------------------------------------- +// APP CPU entry point. +// ----------------------------------------------------------------------- +.section .text.call_start_cpu1 + .literal_position + .align 4 +.Lstack1_top_addr: + .long _stack1_top +.Lrun_core1_entry_addr: + .long tinygo_runCore1 +.Lvector_table_addr_cpu1: + .long _vector_table + +.global call_start_cpu1 +call_start_cpu1: + // CPU1 starts from ROM with no stack contract that TinyGo can rely on. + // Repeat the CPU-local windowed-ABI setup from CPU0, then enter Go. + rsr.ps a2 + movi a3, ~(PS_WOE) + and a2, a2, a3 + wsr.ps a2 + rsync + + rsr.windowbase a2 + ssl a2 + movi a2, 1 + sll a2, a2 + wsr.windowstart a2 + rsync + + l32r a1, .Lstack1_top_addr + + rsr.ps a2 + movi a3, PS_WOE + or a2, a2, a3 + wsr.ps a2 + rsync + + movi a2, 1 + wsr.cpenable a2 + rsync + + l32r a8, .Lvector_table_addr_cpu1 + wsr.vecbase a8 + rsync + + rsr.ps a2 + movi a3, ~0x1F + and a2, a2, a3 + movi a3, 0x20 + or a2, a2, a3 + wsr.ps a2 + rsync + + mov a5, a1 + l32r a4, .Lrun_core1_entry_addr + callx4 a4 + +1: j 1b + // ----------------------------------------------------------------------- // tinygo_scanCurrentStack — Spill all Xtensa register windows to the // stack, then call tinygo_scanstack(sp) so the conservative GC can diff --git a/src/internal/task/task_stack_esp32.go b/src/internal/task/task_stack_esp32.go index 06613b296c..21510870ea 100644 --- a/src/internal/task/task_stack_esp32.go +++ b/src/internal/task/task_stack_esp32.go @@ -1,4 +1,4 @@ -//go:build scheduler.tasks && (esp32 || esp32s3) +//go:build (scheduler.tasks || scheduler.cores) && (esp32 || esp32s3) package task @@ -12,10 +12,12 @@ package task // https://0x04.net/~mwk/doc/xtensa.pdf import ( + _ "unsafe" "unsafe" ) -var systemStack uintptr +//go:linkname runtime_systemStackPtr runtime.systemStackPtr +func runtime_systemStackPtr() *uintptr // calleeSavedRegs is the list of registers that must be saved and restored when // switching between tasks. Also see task_stack_esp8266.S that relies on the @@ -60,19 +62,20 @@ func (s *state) archInit(r *calleeSavedRegs, fn uintptr, args unsafe.Pointer) { } func (s *state) resume() { - swapTask(s.sp, &systemStack) + swapTask(s.sp, runtime_systemStackPtr()) } func (s *state) pause() { - newStack := systemStack - systemStack = 0 + systemStackPtr := runtime_systemStackPtr() + newStack := *systemStackPtr + *systemStackPtr = 0 swapTask(newStack, &s.sp) } // SystemStack returns the system stack pointer when called from a task stack. // When called from the system stack, it returns 0. func SystemStack() uintptr { - return systemStack + return *runtime_systemStackPtr() } //export tinygo_task_current diff --git a/src/runtime/runtime_esp32s3_cores.go b/src/runtime/runtime_esp32s3_cores.go new file mode 100644 index 0000000000..8582c0d573 --- /dev/null +++ b/src/runtime/runtime_esp32s3_cores.go @@ -0,0 +1,217 @@ +//go:build esp32s3 && scheduler.cores + +package runtime + +import ( + "device" + "device/esp" + "internal/task" + "runtime/interrupt" + "runtime/volatile" + "sync/atomic" + "unsafe" +) + +const numCPU = 2 + +const crosscoreCPUInt = 12 + +const ( + crosscoreReasonWake = 1 << iota + crosscoreReasonGC +) + +var ( + printLock spinLock + schedulerLock spinLock + atomicsLock spinLock + futexLock spinLock +) + +var sleepingCore uint8 = 0xff +var waitingCores uint8 +var cpu1Started atomic.Uint32 +var crosscoreReason [numCPU]atomic.Uint32 +var gcSignalWait volatile.Register8 + +func hasSleepingCore() bool { + return sleepingCore != 0xff +} + +func sleepTicksMulticore(d timeUnit) { + sleepingCore = uint8(currentCPU()) + schedulerLock.Unlock() + sleepTicks(d) + schedulerLock.Lock() + sleepingCore = 0xff +} + +func interruptSleepTicksMulticore(wakeup timeUnit) { + _ = wakeup + schedulerWake() +} + +func schedulerUnlockAndWait() { + core := currentCPU() + waitingCores |= uint8(1 << core) + schedulerLock.Unlock() + device.Asm("waiti 0") + schedulerLock.Lock() + waitingCores &^= uint8(1 << core) +} + +func schedulerWake() { + if waitingCores == 0 { + return + } + core := currentCPU() ^ 1 + if waitingCores&(1<> 13) & 1) +} + +func startSecondaryCores() { + initCrosscoreInterrupt(0) + + esp.RTC_CNTL.SetOPTIONS0_SW_STALL_APPCPU_C0(0) + esp.RTC_CNTL.SetSW_CPU_STALL_SW_STALL_APPCPU_C1(0) + + esp.SYSTEM.SetCORE_1_CONTROL_0_CONTROL_CORE_1_CLKGATE_EN(1) + esp.SYSTEM.SetCORE_1_CONTROL_0_CONTROL_CORE_1_RUNSTALL(0) + esp.SYSTEM.SetCORE_1_CONTROL_0_CONTROL_CORE_1_RESETING(1) + esp.SYSTEM.SetCORE_1_CONTROL_0_CONTROL_CORE_1_RESETING(0) + + etsSetAppCPUBootAddr(uint32(uintptr(unsafe.Pointer(&callStartCPU1)))) + + for i := 0; i < 1000000 && cpu1Started.Load() == 0; i++ { + spinLoopWait() + } +} + +func gcPauseCore(core uint32) { + sendCrosscoreInterrupt(core, crosscoreReasonGC) +} + +func gcSignalCore(core uint32) { + gcSignalWait.Set(1) + sendCrosscoreInterrupt(core, crosscoreReasonGC) +} + +func coreStackTop(core uint32) uintptr { + switch core { + case 0: + return uintptr(unsafe.Pointer(&stackTopSymbol)) + case 1: + return uintptr(unsafe.Pointer(&stack1TopSymbol)) + default: + runtimePanic("unexpected core") + return 0 + } +} + +func spinLoopWait() { + device.Asm("nop") +} + +//export tinygo_runCore1 +func runCore1() { + interruptInit() + initCrosscoreInterrupt(1) + etsSetAppCPUBootAddr(0) + cpu1Started.Store(1) + schedulerLock.Lock() + scheduler(false) + schedulerLock.Unlock() + exit(0) +} + +func initCrosscoreInterrupt(core uint32) { + if core == 0 { + esp.INTERRUPT_CORE0.SetCPU_INTR_FROM_CPU_0_MAP(crosscoreCPUInt) + } else { + esp.INTERRUPT_CORE1.SetCPU_INTR_FROM_CPU_1_MAP(crosscoreCPUInt) + } + intr := interrupt.New(crosscoreCPUInt, crosscoreInterruptHandler) + _ = intr.Enable() +} + +func crosscoreInterruptHandler(interrupt.Interrupt) { + handleCrosscoreInterrupt(currentCPU()) +} + +func sendCrosscoreInterrupt(core uint32, reason uint32) { + crosscoreReason[core].Or(reason) + if core == 0 { + esp.SYSTEM.SetCPU_INTR_FROM_CPU_0(1) + } else { + esp.SYSTEM.SetCPU_INTR_FROM_CPU_1(1) + } +} + +func clearCrosscoreInterrupt(core uint32) { + if core == 0 { + esp.SYSTEM.SetCPU_INTR_FROM_CPU_0(0) + } else { + esp.SYSTEM.SetCPU_INTR_FROM_CPU_1(0) + } +} + +func handleCrosscoreInterrupt(core uint32) { + clearCrosscoreInterrupt(core) + reason := crosscoreReason[core].Swap(0) + if reason&crosscoreReasonGC != 0 { + gcInterruptHandler(core) + } +} + +func gcInterruptHandler(hartID uint32) { + gcScanState.Add(1) + for gcSignalWait.Get() == 0 { + spinLoopWait() + } + gcSignalWait.Set(0) + + scanCurrentStack() + if !task.OnSystemStack() { + markRoots(task.SystemStack(), coreStackTop(hartID)) + } + + gcScanState.Store(1) + for gcSignalWait.Get() == 0 { + spinLoopWait() + } + gcSignalWait.Set(0) + gcScanState.Add(1) +} + +type spinLock struct { + atomic.Uint32 +} + +func (l *spinLock) Lock() { + for !l.CompareAndSwap(0, 1) { + spinLoopWait() + } +} + +func (l *spinLock) Unlock() { + if schedulerAsserts && l.Load() != 1 { + runtimePanic("unlock of unlocked spinlock") + } + l.Store(0) +} + +//go:extern _stack1_top +var stack1TopSymbol [0]uint32 + +//go:extern call_start_cpu1 +var callStartCPU1 [0]uint32 + +//go:linkname etsSetAppCPUBootAddr ets_set_appcpu_boot_addr +func etsSetAppCPUBootAddr(addr uint32) diff --git a/targets/esp32s3.ld b/targets/esp32s3.ld index 6a08f29983..4a2ecab887 100644 --- a/targets/esp32s3.ld +++ b/targets/esp32s3.ld @@ -54,6 +54,9 @@ SECTIONS . = ALIGN(16); . += _stack_size; _stack_top = .; + . = ALIGN(16); + . += _stack_size; + _stack1_top = .; } >DRAM /* Global variables that are mutable and zero-initialized. */ @@ -180,6 +183,7 @@ memset = 0x400011e8; memcpy = 0x400011f4; memmove = 0x40001200; memcmp = 0x4000120c; +ets_set_appcpu_boot_addr = 0x40000720; /* From ESP-IDF: * components/esp_rom/esp32/ld/esp32.rom.libgcc.ld