#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include "kernel_int.h"
#include "atomic.h"
#include "kernel.h"
#include "printf.h"
#include "irqs.h"
#include "emu.h"

//XXX: TODO: we need to implement the DSP extension to run Sony code
//XXX: TODO: The linker stub code uses a pointer below SP. Luckily it is in ARM mode, but that is still not safe on Cortex :(


//flag bits of the ARM status register that we treat as application-visible
#define ARM_SR_BIT_N					0x80000000UL
#define ARM_SR_BIT_Z					0x40000000UL
#define ARM_SR_BIT_C					0x20000000UL
#define ARM_SR_BIT_V					0x10000000UL
#define ARM_SR_BIT_Q					0x08000000UL		//XXX: note: due to some asm code (search for "wholeSrOut") this bit gets lost unless we explicitly preserve it. We do not
#define ARM_SR_BITS_APP					(ARM_SR_BIT_N | ARM_SR_BIT_Z | ARM_SR_BIT_C | ARM_SR_BIT_V | ARM_SR_BIT_Q)	//mask of all of the above

//NOTE(review): presumably the mode-field value for user mode in an ARMv5 status register; not referenced in this part of the file
#define ARM_V5_SR_USER_MODE_BITS		0x00000010UL

//indices of the special registers in the emulated register array
#define REG_NO_SP		13
#define REG_NO_LR		14
#define REG_NO_PC		15


//number of entries in the mContexts pool
#define NUM_EMU_CONTEXTS		4



//One emulation context: the emulated CPU state plus a small private stack.
//emuEnterArmEmulationEmulatedCores() hands the end of "stack" to emuAsmEntry() as the SP to switch to.
struct EmuContext {
	struct EmuCpuState cpu;		//order used in ASM code (asm addresses the saved registers by fixed offsets from the context pointer)
	uint32_t stack[16];		//can be smaller later but bigger for now since we print from in here, must be last in struct
};

//pool of contexts; emuGetFreeContext() treats an entry as free while its cpu.regs[REG_NO_PC] is zero
static struct EmuContext mContexts[NUM_EMU_CONTEXTS];



//Claims a free emulation context from the mContexts pool.
// A context counts as free while its stored PC (cpu.regs[REG_NO_PC]) is zero. Each slot is probed
// with atomicTestAndSet32(&pc, 0, 1) and the first slot for which that call reports success is
// returned, i.e. the grab is done atomically by replacing a zero PC with the nonzero placeholder 1.
// (NOTE(review): the exact contract of atomicTestAndSet32 lives in atomic.h - confirm.)
// When done with a context, free it by setting its PC back to zero.
//Returns: pointer to the claimed context, or NULL if every context is in use.
struct EmuContext* emuGetFreeContext(void)
{
	uint32_t i;
	
	for (i = 0; i < NUM_EMU_CONTEXTS; i++) {
		
		if (atomicTestAndSet32(&mContexts[i].cpu.regs[REG_NO_PC], 0, 1))
			return mContexts + i;
	}
	
	return NULL;
}

//Entry and exit trampoline around the emulator core (emuCpuRun). Naked: the body is entirely hand-written asm.
//emuEnterArmEmulationEmulatedCores() points an exception frame's PC here with r0 = ctx and r1 = sp.
//  ctx: emulation context; the asm treats it as an array of 32-bit slots: r0..r15 at slots 0..15, SR at slot 16
//  sp:  stack pointer to switch to while the emulator core runs
//Entry half: stores the current SP into slot 13 and r4-r11 into slots 4..11, switches SP to "sp", then calls emuCpuRun.
//Exit half: runs when emuCpuRun returns. It uses r0 as the context pointer
//  (NOTE(review): so emuCpuRun is presumably expected to return with the context pointer in r0 - confirm),
//  reloads SP, r0-r12, LR and the APSR flags from the context, zeroes the context's PC slot to mark the context free,
//  and finally jumps to the PC value that was stored in the context.
static void __attribute__((naked)) emuAsmEntry(struct EmuContext *ctx, uint32_t sp)
{
		asm volatile(
			".syntax unified					\n\t"
			
	#ifdef BUILD_FOR_THUMB_1
			//Thumb-1 cannot store SP or high registers directly, so they are staged through low registers
			"	mov   r2, sp					\n\t"		//save sp
			"	str	  r2, [r0, #0x4 * 13]		\n\t"
			"	mov   sp, r1					\n\t"		//replace SP
			"	mov   r1, r0					\n\t"		//get pointer to r4 in context
			"	adds  r1, #16					\n\t"
			"	stmia r1!, {r4-r7}				\n\t"		//save r4-r7
			"	mov   r4, r8					\n\t"
			"	mov   r5, r9					\n\t"
			"	mov   r6, r10					\n\t"
			"	mov   r7, r11					\n\t"
			"	stmia r1!, {r4-r7}				\n\t"		//save r8-r11 (copied into r4-r7 above)
			"	bl    emuCpuRun					\n\t"		//call the asm handler
	#else
			"	str	  sp, [r0, #0x4 * 13]		\n\t"		//save sp
			"	mov   sp, r1					\n\t"		//replace SP
			"	adds  r1, r0, #16				\n\t"		//get pointer to r4 in context
			"	stmia r1, {r4-r11}				\n\t"		//save most other regs
			"	bl    emuCpuRun					\n\t"		//call the asm handler
	#endif
	
//The body of what used to be emuAsmExit() follows here. It MUST directly follow the code above, so that the code above can use "BL" to call emuCpuRun and have it return here

	#ifdef BUILD_FOR_THUMB_1
	
			"	ldr   r1, [r0, #0x04 * 13]		\n\t"		//load SP
			"	mov   sp, r1					\n\t"		//use it
			
			"	ldmia r0!, {r1-r4}				\n\t"		//load what will be our r0-r3 (r0 now points to the r4 slot)
			"	ldr   r5, [r0, #0x04 * 11]		\n\t"		//load what will be our pc
			"	push  {r1-r5}					\n\t"		//push them to stack in such order that we can pop {r0-r3, pc}
			
			"	ldr   r1, [r0, #0x04 * 12]		\n\t"		//load what will be our SR
			"	ldr   r2, [r0, #0x04 * 8]		\n\t"		//load what will be our R12
			"	ldr   r3, [r0, #0x04 * 10]		\n\t"		//load what will be our LR
			"	push  {r1-r3}					\n\t"		//push them to stack in such order that we can pop and set them as if pop {sr, r12, lr}
		
			"	ldmia r0!, {r4-r7}				\n\t"		//for real load our r4-r7
			"	ldmia r0!, {r1-r2}				\n\t"		//load what will be r8,r9
			"	mov   r8, r1					\n\t"		//put them into place
			"	mov   r9, r2					\n\t"
			"	ldmia r0!, {r1-r2}				\n\t"		//load what will be r10,r11 (r0 now points to the r12 slot)
			"	mov   r10, r1					\n\t"		//put them into place
			"	mov   r11, r2					\n\t"
		
			"	movs  r1, #0					\n\t"		//get a zero
			"	str   r1, [r0, #0x4 * 3]		\n\t"		//zero out the context's PC to mark it as free
		#ifdef EXPLICIT_EMU_CTX
			"	movs  r0, #0					\n\t"
			"	bl    schedSetCurThreadEmuCtx	\n\t"		//set cur thread's context to NULL
		#endif
			"	pop   {r0-r2}					\n\t"		//get sr, r12, lr
			"	msr   APSR_nzcvq, r0			\n\t"		//put them into place
			"	mov   r12, r1					\n\t"
			"	mov   lr, r2					\n\t"
			"	pop   {r0-r3, pc}				\n\t"		//load r0-r3 and jump to proper pc
		
	#else
	
			"	mov   lr, r0					\n\t"		//stash context into LR
			"	ldmia lr!, {r0-r12}				\n\t"		//load r0-r12
			"	ldr   sp, [lr]					\n\t"		//load SP
			"	add   lr, #4					\n\t"		//cannot use postincrement on prev instr since that makes it unpredictable if irqs are on
			"	push  {r0-r6,r12,lr}			\n\t"		//now on real stack we can push some regs (some callee-saved ones and the rest to cover for what a func call may clobber). lr is just to make space to store pc
			"	ldmia lr, {r4,r5,r6}			\n\t"		//grab {.LR,.PC,.SR} into {r4,r5,r6}, leave LR pointing to .LR
			"	str   r5, [sp, #0x20]			\n\t"		//store PC in proper place (the 9th pushed word, where lr was pushed)
			"	movs  r0, #0					\n\t"		//get a zero
			"	str   r0, [lr, #0x4]			\n\t"		//zero out the context's PC to mark it as free
		#ifdef EXPLICIT_EMU_CTX
			"	bl    schedSetCurThreadEmuCtx	\n\t"		//set cur thread's context to NULL (r0 is still zero here)
		#endif
			"	mov   lr, r4					\n\t"		//set LR
			"	msr   APSR_nzcvq, r6			\n\t"		//restore sr
			"	pop   {r0-r6,r12,pc}			\n\t"		//and we're out!
	#endif
			".ltorg								\n\t"
		);
}

//Redirects a stacked exception frame into the ARM emulator (not used for jit cores).
// Grabs a free emulation context, seeds it with the registers held in the exception frame
// (r0-r3, r12, lr, pc and the application flag bits of sr), then rewrites the frame so that
// it resumes in emuAsmEntry() with r0 = context and r1 = end of the context's private stack.
// Calls fatal() if no context is available.
void __attribute__((used)) emuEnterArmEmulationEmulatedCores(struct CortexExcFrame *exc)
{
	struct EmuContext *emuCtx = emuGetFreeContext();
	uint32_t *privStackEnd;
	
	if (emuCtx == NULL)
		fatal("Cannot get a free emulation context!\n");
	
	//one past the last word of the private stack
	privStackEnd = &emuCtx->stack[sizeof(emuCtx->stack) / sizeof(emuCtx->stack[0])];
	
	//copy over everything the exception frame holds for us
	emuCtx->cpu.regs[0] = exc->r0;
	emuCtx->cpu.regs[1] = exc->r1;
	emuCtx->cpu.regs[2] = exc->r2;
	emuCtx->cpu.regs[3] = exc->r3;
	emuCtx->cpu.regs[12] = exc->r12;
	emuCtx->cpu.regs[REG_NO_LR] = exc->lr;
	emuCtx->cpu.regs[REG_NO_PC] = exc->pc;
	emuCtx->cpu.sr = exc->sr & ARM_SR_BITS_APP;	//only app-visible flags - do not leak v7 bits into arm SR
	
	//return out of the fault context into whatever mode we were in, landing in emuAsmEntry(emuCtx, privStackEnd)
	exc->r0 = (uint32_t)emuCtx;
	exc->r1 = (uint32_t)privStackEnd;
	exc->pc = (uint32_t)&emuAsmEntry & ~(uint32_t)1;	//frame PC is stored with bit 0 clear
}

//Handles a Thumb-mode "BLX <imm>" instruction that raised UFSR.UNDEFINSTR.
//MUST be called with LR = exc_return value (that is directly from exc handler)
//  exc:      exception frame of the faulting code; exc->pc points at the BLX
//  ufsrAddr: address of UFSR; 0x0001 is written to it to clear UNDEFINSTR
//  ufsrVal:  UFSR value read by the caller (not used here)
//What it does:
//  - sets the frame's LR to the return address (address after the 4-byte BLX, with the thumb bit set)
//  - decodes the branch offset and computes the (word-aligned) target address
//  - fast path: if the target is the two-instruction stub "LDR r12, [r9, #-imm]" / "LDR pc, [r12, #imm]",
//    performs those two loads natively; if the loaded destination is thumb code, the frame is pointed
//    right at it and we return without entering emulation
//  - otherwise stores the target as the frame's PC and enters ARM emulation via emuEnterArmEmulationEmulatedCores()
void __attribute__((used)) emuThumbBlxImmHandler(struct CortexExcFrame *exc, volatile uint16_t *ufsrAddr, uint32_t ufsrVal)
{
	uint32_t ofst, newPc, *instrs, instr1, r12val, instr2;
	uint16_t* pc = (uint16_t*)exc->pc;
	uint32_t firstHalf = pc[0];
	uint32_t secondHalf = pc[1];
	
	
	*ufsrAddr = 0x0001;			//clear UFSR.UNDEFINSTR (caller checked that it was set)
	
	//no matter what, LR needs to be set
	exc->lr = exc->pc + 5;
	
	//these weird unions & bitfields are what is needed for GCC to properly emit UBFX/SBFX/BFI instructions that are most efficient for this code
	//dst: the assembled, sign-extended byte offset
	union {
		int32_t val;
		struct {
			uint32_t zero	: 1;
			uint32_t low11	: 11;
			uint32_t mid10	: 10;
			uint32_t i2		: 1;
			uint32_t i1		: 1;
			uint32_t unused	: 8;
		};
		struct {
			int32_t unused1	: 24;
			int32_t s		: 8;		//signed field: assigning the 1-bit signed S to it fills the top 8 bits with the sign
		};
	} dst = {0,};
	//srcFirst: first halfword of the instruction (low 10 bits of immediate, S at bit 10)
	union {
		uint16_t val;
		struct {
			uint16_t low10		: 10;
			uint16_t unused		: 1;
			uint16_t reserved	: 5;
		};
		struct {
			int16_t unused1		: 10;
			int16_t s			: 1;	//signed 1-bit field: reads as 0 or -1
			int16_t unused2		: 5; 
		};
	} srcFirst = {.val = firstHalf,};
	//srcSecond: second halfword of the instruction (low 11 bits of immediate, J2 at bit 11, J1 at bit 13)
	union {
		uint16_t val;
		struct {
			uint16_t low11		: 11;
			uint16_t j2			: 1;
			uint16_t unused1	: 1;
			uint16_t j1			: 1;
			uint16_t unused2	: 2;
		};
	} srcSecond = {.val = secondHalf, };
	
	//handle S (convert J1&J2 into I1&I2): when S is clear both J bits get inverted (0x2800 = J1 | J2), else they are used as is
	if (!srcFirst.s)
		srcSecond.val ^= 0x2800;
	
	dst.low11 = srcSecond.low11;
	dst.mid10 = srcFirst.low10;
	dst.i2 = srcSecond.j2;
	dst.i1 = srcSecond.j1;
	dst.s = srcFirst.s;
	
	ofst = dst.val;
	
	//calculate new pc (target is ARM code, so it is word aligned)
	newPc = (exc->pc + 4 + ofst) &~ 3;
	
	//shortcut for BLX direct to R9-call (this is purely for speed and not required)
	
	instrs = (uint32_t*)newPc;
	instr1 = *instrs++;
	
	//first instr: LDR r12, [r9, #-imm] with imm being one of 0, 4, 8, 12
	if ((instr1 & 0xFFFFFFF3) == 0xE519C000) {
		
		instr2 = *instrs++;
		
		//second instr: LDR pc, [r12, #imm] with imm being a multiple of 4
		if ((instr2 & 0xFFFFF003) == 0xE59CF000) {
		
			//do both loads natively. %0 is early-clobber since it is written before the last input is read
			asm(
			#ifdef BUILD_FOR_THUMB_1
				"mov %0, r9					\n\t"	//r9 cannot be a base reg for this load in thumb-1, so copy it first
				"ldr %0, [%0, %2]			\n\t"	//base must be the copy we just made, not whatever happens to be in r0
			#else
				"ldr %0, [r9, %2]			\n\t"
			#endif
				"ldr %1, [%0, %3]			\n\t"
				:"=&r"(r12val),"=r"(newPc)
				:"r"(-(instr1 & 0x0F)),"r"(instr2 & 0xFFF)
			);
			
			////be 100% correct: clobber r12 here as native code would
			////this could be removed for speed as ABI says it shouldnt matter
			exc->r12 = r12val;
			
			//if target is thumb, return to it directly
			if (newPc & 1) {
				exc->pc = newPc &~ 1;
				//no need to set T flag in SR - it is still set since we came from BLX
				return;
			}
			//else we'll just go out with newPc already set as it should be
		}
	}
	
	//write PC
	exc->pc = newPc;
	//exc->sr &=~ CORTEX_SR_FLAG_T;	//not needed since emuEnterArmEmulation won't check and we want it set so OUR code can run, you know...
	
	emuEnterArmEmulationEmulatedCores(exc);
}

#ifdef BUILD_FOR_THUMB_1

	//Thumb-1 builds: asm snippets and hooks defined ahead of the fault dispatcher header included below
	//(NOTE(review): presumably consumed by that header - confirm against it)

	//asm: jump to emuEnterArmEmulationEmulatedCores (its address is loaded into r1 first)
	#define CODE_GOTO_ARM_PC_IN_EXC_VALID																												\
		"	ldr   r1, =emuEnterArmEmulationEmulatedCores	\n\t"	/* enter emulation */																\
		"	bx    r1										\n\t"

	//asm: store r2 at offset 0x18 of the exception frame pointed to by r0, then enter emulation as above
	#define CODE_TO_GO_TO_ARM																															\
		"	str   r2, [r0, #0x18]							\n\t"	/* store addr in exc frame since that is expected */								\
		CODE_GOTO_ARM_PC_IN_EXC_VALID

	//the remaining hooks are intentionally empty
	#define CODE_FOR_UNDEF_32BIT_INSTR					/* nothing */
	
	#define CODE_FOR_POSSIBLY_UNDEF_16BIT_INSTR			/* nothing */
	
	#define CODE_AT_END_EXTRA							/* nothing */
	#define EXTRA_ASM_INPUTS							/* nothing */

	//pick the fault dispatcher implementation
	#ifdef CUSTOM_RP2040_FAULT_DISPATCHER
		#include "m0FaultDispatch_rp2040_ROMRAM.h"
	#else
		#include "m0FaultDispatch_debug.h"
	#endif
	
#else
	
	//UsageFault exception handler for builds without BUILD_FOR_THUMB_1. Naked: the body is entirely hand-written asm.
	//  1. r0 = stack pointer the exception frame is on (MSP if bit 2 of LR is clear, else PSP)
	//  2. if the stacked SR has the T bit clear: set T in the stacked SR, write 2 to UFSR (clears INVSTATE)
	//     and tail-call emuEnterArmEmulationEmulatedCores with r0 = frame
	//  3. else, if UFSR.UNDEFINSTR is set and the instruction bytes at the stacked PC match the BLX.imm pattern checked below:
	//     tail-call emuThumbBlxImmHandler with r0 = frame, r1 = UFSR address, r2 = UFSR value
	//  4. everything else goes to schedHandleUsageFaultNoCp (if UFSR.NOCP is set) or faultHandlerWithExcFrame
	void __attribute__((used,naked)) UsageFault_Handler(void)
	{
		asm volatile(
			"	tst    lr, #4						\n\t"	//see which stack fault was on
			"	ite    eq							\n\t"
			"	mrseq  r0, msp						\n\t"	//grab the appropriate SP
			"	mrsne  r0, psp						\n\t"
			"	ldr    r3, [r0, #0x1c]				\n\t"	//get SR
			"	lsrs   r1, r3, #25					\n\t"	//test for T flag (leaving C flag set if T was set)
			"	ldr    r1, =0xE000ED2A				\n\t"	//UFSR address
			"	ldrh   r2, [r1]						\n\t"	//get UFSR
			"	bcs    5f							\n\t"	//anything after this is handling the T bit being clear
			"	orr    r3, %0						\n\t"	//set T bit in the val we loaded from stacked SR
			"	movs   r2, #2						\n\t"
			"	str    r3, [r0, #0x1c]				\n\t"	//set stacked sr
			"	strh   r2, [r1]						\n\t"	//clear UFSR.INVSTATE
			"	b      emuEnterArmEmulationEmulatedCores	\n\t"
			
			".ltorg									\n\t"
			
			//there is also a special case - BLX.imm instr is not supported by C-Mx cpus but we see it in PalmOS's thumb mode - we need to catch it here
			//since it is a single instr, we do not bother exiting exc mode for it, handle it and go to emulation directly as we know for sure it references arm mode
			
			"5:										\n\t"
			"	lsrs   r3, r2, #1					\n\t"	//test for UFSR.UNDEFINSTR
			"	bcc    1f							\n\t"	//not undef instr - not interested
			"	ldr    r3, [r0, #0x18]				\n\t"	//load PC
			"	ldrb   r12, [r3, #1]				\n\t"	//load high byte of first half of instr
			"	and    r12, #0xF8					\n\t"	//verify it is proper for BLX
			"	cmp    r12, #0xF0					\n\t"
			"	bne    2f							\n\t"
			"	ldrb   r12, [r3, #3]				\n\t"	//load high byte of second half of instr
			"	and    r12, #0xD0					\n\t"	//verify it is proper for BLX
			"	cmp    r12, #0xC0					\n\t"
			"	beq    emuThumbBlxImmHandler		\n\t"	//tail call: r0 = frame, r1 = UFSR address, r2 = UFSR value, LR is still the exc_return value
		
			//NOTE(review): the push below has no matching pop on any path visible here (execution falls into 1: and branches away),
			//and the r3 written by the mov is overwritten by the lsrs at 1: before being read - looks like a leftover, confirm
			"2:										\n\t"	//other undefined instrs
			"	push   {lr}							\n\t"
			"   mov    r3, lr						\n\t"
		
			"1:										\n\t"
			"	lsrs   r3, r2, #4					\n\t"	//test for UFSR.NOCP
			"	bcs    schedHandleUsageFaultNoCp	\n\t"
			"	b      faultHandlerWithExcFrame		\n\t"
			
			//NOTE(review): nothing in this block branches to 3: - it is only reachable if something outside this block jumps here; confirm whether it is still needed
			"3:										\n\t"
			"	pop    {r3}							\n\t"
			"	lsrs   r2, r3, #3					\n\t"	//shift out 0x04 so we know which sp to set
			"	ite    cc							\n\t"
			"	msrcc  msp, r0						\n\t"	//write r0 back to the appropriate SP
			"	msrcs  psp, r0						\n\t"
			"	bx     r3							\n\t"
			:
			:"I"(CORTEX_SR_FLAG_T)
		);
	}

#endif
