#include "halPenAndKeys.h"
#include "kernel_int.h"
#include "hostCtl.h"
#include "printf.h"
#include "kernel.h"
#include "atomic.h"
#include "timers.h"
#include "input.h"
#include "entry.h"
#include "slab.h"
#include "heap.h"
#include "list.h"
#include "irqs.h"
#include "mpu.h"
#include "emu.h"
#include "rtc.h"
#include "kal.h"
#include "ral.h"
#include "dal.h"
#include "cpu.h"


/*
	SVC - highest exc - callable from all levels, not interruptible. can change mNextTask. can change curTask to NULL only
	SysTick - lowest exception, higher than all irqs  - for precise timestamps
	TimerIRQ - highest prio irq for precise timing
	
	other irq here, priority as desired
	
	PendSV - lowest prio - returns only to userspace. the only one who can change curTask to a non-NULL value


	all scheduling is done in SVC mode by being called via an SVC instr
	all context switches done in PendSV, which is triggered for it.

	
*/

//Register numbers. 13/14/15 match the ARM core numbering of SP/LR/PC.
//NOTE(review): REG_NO_PARAM is presumably r0 as the first-argument register; nothing visible in this file uses these - confirm against callers
#define REG_NO_PARAM				0
#define REG_NO_SP					13
#define REG_NO_LR					14
#define REG_NO_PC					15

#ifdef  BUILD_FOR_THUMB_1

	//v6m has no user mode so copy from user/to user are plain copies that always succeed

	/*
		Copy len bytes from userSrc into dst.
		In this build variant there is no checked user access path, so this is
		a direct memcpy() and the result is unconditionally true.
	*/
	bool copyFromUser(void* dst, const void* userSrc, uint32_t len)
	{
		(void)memcpy(dst, userSrc, len);	//return value (dst) is of no interest

		return true;
	}
	
	/*
		Copy len bytes from src into userDst.
		In this build variant there is no checked user access path, so this is
		a direct memcpy() and the result is unconditionally true.
	*/
	bool copyToUser(void* userDst, const void* src, uint32_t len)
	{
		(void)memcpy(userDst, src, len);	//return value (userDst) is of no interest

		return true;
	}
	
	/*
		Fetch one 32-bit word from userPtr and store it to *dst.
		This build variant performs a plain load and always reports success.
	*/
	bool readUserWord(uint32_t* dst, const uint32_t *userPtr)
	{
		const uint32_t word = *userPtr;

		*dst = word;

		return true;
	}
	
	/*
		Store the 32-bit value src to userDst.
		This build variant performs a plain store and always reports success.
	*/
	bool writeUserWord(uint32_t* userDst, uint32_t src)
	{
		userDst[0] = src;

		return true;
	}

#else

	/*
		Tiny stub that sets r0 to 0 (false) and returns to whatever LR holds.
		NOTE(review): presumably the fault handler (not in this file) redirects execution here when
		a fault is taken at one of the user_access_N labels below, so that the accessor appears to
		have returned false to its caller - confirm against the fault handling code.
	*/
	bool __attribute__((naked,noinline,used)) userAccessFaultReturnPath(void)
	{
		asm volatile(
			".syntax unified			\n\t"
			"	movs r0, #0				\n\t"	//return value = false
			"	bx   lr					\n\t"	//return using the LR that is live at this point
		:::"memory","r0","r1","r2","r3","r12","lr");
	}
	
	/*
		Copy len bytes from userSrc to dst, one byte at a time.
		The asm uses the arguments directly in their incoming registers: r0 = dst, r1 = userSrc, r2 = len.
		Each source byte is read with LDRBT (the unprivileged-access form of the load); the destination is written with a normal STRB.
		Returns 1 (true) once all bytes were copied.
		The global label user_access_3 marks the one instruction that touches user memory.
		NOTE(review): presumably the fault handler matches the faulting PC against this label and makes
		the function return false (see userAccessFaultReturnPath) - confirm in the fault handling code.
		NOTE(review): the loop exit test is a signed compare (blt) after the decrement - confirm behaviour for len >= 0x80000000 is acceptable.
	*/
	bool __attribute__((naked,noinline,used)) copyFromUser(void* dst, const void* userSrc, uint32_t len)
	{
		bool dummy;		//using this generates useless instructions, but they never run and it nicely silenced gcc's warnings so why not
		
		asm volatile(
			".syntax unified			\n\t"
			"1:							\n\t"
			"	subs  r2, #1			\n\t"	//one less byte to go
			"	blt   2f				\n\t"	//went negative -> done
			".globl user_access_3		\n\t"
			"user_access_3:				\n\t"
			"	ldrbt r3, [r1]			\n\t"	//read source byte using the unprivileged-access load (no writeback form, hence the separate add below)
			"	strb  r3, [r0], #1		\n\t"	//store to dst, post-increment dst
			"	adds  r1, #1			\n\t"	//advance source
			"	b     1b				\n\t"
			"2:							\n\t"
			"	movs r0, #1				\n\t"	//success
			"	bx   lr					\n\t"
		:"=r"(dummy)::"memory","r0","r1","r2","r3","r12","lr");
		
		return dummy;
	}
	
	/*
		Copy len bytes from src to userDst, one byte at a time.
		The asm uses the arguments directly in their incoming registers: r0 = userDst, r1 = src, r2 = len.
		Each source byte is read with a normal LDRB; the destination is written with STRBT (the unprivileged-access form of the store).
		Returns 1 (true) once all bytes were copied.
		The global label user_access_2 marks the one instruction that touches user memory.
		NOTE(review): presumably the fault handler matches the faulting PC against this label and makes
		the function return false (see userAccessFaultReturnPath) - confirm in the fault handling code.
	*/
	bool __attribute__((naked,noinline,used)) copyToUser(void* userDst, const void* src, uint32_t len)
	{
		bool dummy;		//using this generates useless instructions, but they never run and it nicely silenced gcc's warnings so why not
		
		asm volatile(
			".syntax unified			\n\t"
			"1:							\n\t"
			"	subs  r2, #1			\n\t"	//one less byte to go
			"	blt   2f				\n\t"	//went negative -> done
			"	ldrb  r3, [r1], #1		\n\t"	//read source byte, post-increment source
			".globl user_access_2		\n\t"
			"user_access_2:				\n\t"
			"	strbt r3, [r0]			\n\t"	//write destination byte using the unprivileged-access store
			"	adds  r0, #1			\n\t"	//advance destination
			"	b     1b				\n\t"
			"2:							\n\t"
			"	movs r0, #1				\n\t"	//success
			"	bx   lr					\n\t"
		:"=r"(dummy)::"memory","r0","r1","r2","r3","r12","lr");
		
		return dummy;
	}
	
	/*
		Read one 32-bit word from userPtr into *dst.
		The asm uses the arguments directly in their incoming registers: r0 = dst, r1 = userPtr.
		Returns 0 (false) without touching memory if userPtr is not 4-byte aligned.
		Otherwise loads the word with LDRT (the unprivileged-access form of the load), stores it to *dst and returns 1 (true).
		The global label user_access_1 marks the one instruction that touches user memory.
		NOTE(review): presumably the fault handler matches the faulting PC against this label and makes
		the function return false (see userAccessFaultReturnPath) - confirm in the fault handling code.
	*/
	bool __attribute__((naked,noinline,used)) readUserWord(uint32_t* dst, const uint32_t *userPtr)
	{
		bool dummy;		//using this generates useless instructions, but they never run and it nicely silenced gcc's warnings so why not
		
		asm volatile(
			".syntax unified			\n\t"
			"	tst  r1, #3				\n\t"	//reject unaligned user pointers
			"	bne  1f					\n\t"
			".globl user_access_1		\n\t"
			"user_access_1:				\n\t"
			"	ldrt r2, [r1]			\n\t"	//unprivileged-access load of the user word
			"	str  r2, [r0]			\n\t"	//normal store to the kernel-side destination
			"	movs r0, #1				\n\t"	//success
			"	bx   lr					\n\t"
			"1:							\n\t"
			"	movs r0, #0				\n\t"	//failure: misaligned pointer
			"	bx   lr					\n\t"
		:"=r"(dummy)::"memory","r0","r1","r2","r3","r12","lr");
		
		return dummy;
	}
	
	/*
		Write the 32-bit value src to userDst.
		The asm uses the arguments directly in their incoming registers: r0 = userDst, r1 = src.
		Returns 0 (false) without touching memory if userDst is not 4-byte aligned.
		Otherwise stores the word with STRT (the unprivileged-access form of the store) and returns 1 (true).
		The global label user_access_0 marks the one instruction that touches user memory.
		NOTE(review): presumably the fault handler matches the faulting PC against this label and makes
		the function return false (see userAccessFaultReturnPath) - confirm in the fault handling code.
	*/
	bool __attribute__((naked,noinline,used)) writeUserWord(uint32_t* userDst, uint32_t src)
	{
		bool dummy;		//using this generates useless instructions, but they never run and it nicely silenced gcc's warnings so why not
		
		asm volatile(
			".syntax unified			\n\t"
			"	tst  r0, #3				\n\t"	//reject unaligned user pointers
			"	bne  1f					\n\t"
			".globl user_access_0		\n\t"
			"user_access_0:				\n\t"
			"	strt r1, [r0]			\n\t"	//unprivileged-access store of the word
			"	movs r0, #1				\n\t"	//success
			"	bx   lr					\n\t"
			"1:							\n\t"
			"	movs r0, #0				\n\t"	//failure: misaligned pointer
			"	bx   lr					\n\t"
		:"=r"(dummy)::"memory","r0","r1","r2","r3","r12","lr");
		
		return dummy;
	}
#endif

//Read the IPSR special register, i.e. the number of the exception being handled right now.
//In thread mode it reads as zero, which is good enough for the callers in this file.
static int32_t __attribute__((pure)) schedGetIpsr(void)
{
	uint32_t activeExcNum;

	asm(
		"mrs %0, ipsr"
		: "=r"(activeExcNum)
	);

	return (int32_t)activeExcNum;
}

//True when the currently active exception is SVCall.
//IPSR holds an exception number; the *_IRQn constant is converted to one by adding 16.
bool schedAmInSyscall(void)
{
	const int32_t svcExcNum = 16 + SVCall_IRQn;

	return schedGetIpsr() == svcExcNum;
}

//True when the currently active exception is PendSV.
//IPSR holds an exception number; the *_IRQn constant is converted to one by adding 16.
static bool schedAmInPendSV(void)
{
	const int32_t pendSvExcNum = 16 + PendSV_IRQn;

	return schedGetIpsr() == pendSvExcNum;
}

//Sanity check for scheduler internals: the caller must either be running inside the
//SVC handler or have all irqs turned off. If neither holds, this is fatal.
void schedAssertCalledInSyscallMode(void)
{
	//either condition alone makes the call acceptable
	if (schedAmInSyscall() || irqsAreAllOff())
		return;

	fatal("Cannot call this func from non-syscall context or with irqs off\n");
}

/*
	C half of the context switch. Called from PendSV_Handler (asm) and fatal if called from any other context.
	
	excSpFrm   - the outgoing thread's exception frame on its process stack (PendSV_Handler passes PSP).
	             Word indices used here: 0..3 = r0..r3, 4 = r12, 5 = lr, 6 = pc, 7 = sr
	pushedRegs - r4..r11 as pushed by PendSV_Handler. On return they hold the incoming task's r4..r11, and
	             pushedRegs->control holds the CONTROL value PendSV_Handler should apply
	returns    - the exception frame pointer of the task to resume. PendSV_Handler loads it into PSP.
	             If the scheduler is locked, nothing is switched and excSpFrm is returned unchanged.
	
	Sequence: snapshot outgoing regs into ctxTmp (irqs on) -> with irqs off: store ctxTmp (+ FPU state) into the current task,
	pick up the next task's context, publish it as current, load its FPU state -> with irqs back on: write the incoming
	context into pushedRegs and into the exception frame on the incoming task's stack.
*/
uint32_t* __attribute__((used)) schedContextSwitchC(uint32_t *excSpFrm, struct CortexPushedRegs *pushedRegs)	//returns new excFrm pointer
{
	static struct TaskContext ctxTmp;	//pendsv will never interrupt pendsv so this is safe and saves a lot of stack
	//NOTE(review): curTask is sampled here, before irqs are turned off below, while the comment further down says
	//curTask "can go away anytime" - confirm a stale value here is harmless
	struct Task * volatile *curTaskP = schedGetCurTaskP(), *curTask = *curTaskP;
	struct Task *nextTask = schedGetNextTask();
	irq_state_t irqSta;
	uint32_t i;
	
	if (!schedAmInPendSV())
		fatal("Cannot context switch outside of PendSV context\n");
	
	//scheduler locked: stay on the current task (there must be one), leave everything untouched
	if (schedGetLockCount()) {
		if (!*schedGetCurTaskP())
			fatal("Scheduler off and no current task! not ok!\n");
		return excSpFrm;
	}
	
	//save state to a temp place (the static ctxTmp) for now (for quick copy out)
	//any interrupts that happen in the meantime will not modify exc state (they can but what sense does it make for an async irq to modify user regs?)
	//and no svcs can happen since we're still here and not returning to user. this means it is safe to save this state here with ints on
	for (i = 0; i <= 3; i++)
		ctxTmp.regs.r0_r3[i] = excSpFrm[i];
	for (i = 4; i <= 7; i++)
		ctxTmp.regs.r4_r11[i - 4] = pushedRegs->regs4_7[i - 4];
	for (i = 8; i <= 11; i++)
		ctxTmp.regs.r4_r11[i - 4] = pushedRegs->regs8_11[i - 8];
	ctxTmp.regs.r12 = excSpFrm[4];
	ctxTmp.regs.sp = (uint32_t)(excSpFrm + 8);	//thread's SP as it was before the 8-word exception frame got pushed
	ctxTmp.regs.lr = excSpFrm[5];
	ctxTmp.regs.pc = excSpFrm[6];
	ctxTmp.regs.sr = excSpFrm[7];
	ctxTmp.tls = TLS;
	
	//keep in mind that curTask can go away anytime, so disable ints while we do the copy to save current context to current task and grab new task
	irqSta = irqsAllOff();
	if (curTask) {
		
		#ifdef KERNEL_SUPPORTS_VFP
			//only tasks that have used the FPU have FPU state worth saving
			if (curTask->fpuCtx.flags & SCHED_FPU_ENABLED) {
				
				if (curTask->fpuCtx.flags & SCHED_FPU_CTX_FLAGS_TINY_STATE) {
	
					//"tiny" state: only s0..s3 and FPSCR are preserved
					asm volatile("vstmia %0, {s0-s3}"::"r"(curTask->fpuCtx.regs):"memory");
					asm volatile("vmrs %0, FPSCR":"=r"(curTask->fpuCtx.fpscr));
				}
				else {
					
					//full state: s0..s31 and FPSCR
					asm volatile("vstmia %0, {s0-s31}"::"r"(curTask->fpuCtx.regs):"memory");
					asm volatile("vmrs %0, FPSCR":"=r"(curTask->fpuCtx.fpscr));
				}
			}
		#endif
		
		ctxTmp.priv = curTask->ctx.priv;	//we do not read priv, so keep it
		curTask->ctx = ctxTmp;
	}
	if (!nextTask)
		fatal("Unexpectedly no task to switch to!\n");
	
	//from here on ctxTmp holds the INCOMING task's context
	mpuSetStackGuard((uintptr_t)nextTask->stackLimitAddr);
	ctxTmp = nextTask->ctx;
	*curTaskP = curTask = nextTask;
	
	#ifdef KERNEL_SUPPORTS_VFP
		if (curTask->fpuCtx.flags & SCHED_FPU_ENABLED) {
			
			//CPACR bits 20..23 are set for FPU use (same value schedHandleFpuUsed() sets)
			SCB->CPACR |= 0xf << 20;
			
			if (curTask->fpuCtx.flags & SCHED_FPU_CTX_FLAGS_TINY_STATE) {
	
				asm volatile("vldmia %0, {s0-s3}"::"r"(curTask->fpuCtx.regs):"memory");
				asm volatile("vmsr FPSCR, %0"::"r"(curTask->fpuCtx.fpscr));
			}
			else {
				
				asm volatile("vldmia %0, {s0-s31}"::"r"(curTask->fpuCtx.regs):"memory");
				asm volatile("vmsr FPSCR, %0"::"r"(curTask->fpuCtx.fpscr));
			}
		}
		else
			//incoming task has not used the FPU: clear those CPACR bits again.
			//NOTE(review): presumably this makes its first FPU instruction fault with NOCP and end up in schedHandleUsageFaultNoCp() - confirm
			SCB->CPACR &=~ (0xf << 20);
	#endif
	
	irqsRestoreState(irqSta);
	
	//the new curTask can go away anytime from now on, but its stack cannot be freed till the reaper thread runs,
	//so we're safe to use it since we're the only ones who can schedule the reaper thread
	
	//the incoming task's exception frame lives in the 8 words right below its saved SP
	excSpFrm = ((uint32_t*)ctxTmp.regs.sp) - 8;
	
	for (i = 4; i <= 7; i++)
		pushedRegs->regs4_7[i - 4] = ctxTmp.regs.r4_r11[i - 4];
	for (i = 8; i <= 11; i++)
		pushedRegs->regs8_11[i - 8] = ctxTmp.regs.r4_r11[i - 4];
	
	pushedRegs->control = ctxTmp.priv ? 0 : 1;	//set priv appropriately (spsel write will be ignored)
	
	TLS = ctxTmp.tls;
	for (i = 0; i <= 3; i++)
		excSpFrm[i] = ctxTmp.regs.r0_r3[i];
	excSpFrm[4] = ctxTmp.regs.r12;
	excSpFrm[5] = ctxTmp.regs.lr;
	excSpFrm[6] = ctxTmp.regs.pc;
	excSpFrm[7] = ctxTmp.regs.sr;

	return excSpFrm;
}

/*
	PendSV exception entry: the asm half of the context switch.
	Pushes the callee-saved registers (r4..r11) plus one extra word onto the handler stack, calls
	schedContextSwitchC(r0 = PSP, r1 = pointer to the pushed block), then pops the (possibly replaced)
	registers, writes the extra word to CONTROL, sets PSP to the returned value and performs an
	exception return with EXC_RETURN = 0xfffffffd.
	NOTE(review): the pushed block must match the layout of struct CortexPushedRegs (declared elsewhere);
	presumably its "control" member overlays the extra word (lr on Thumb-1, r12 otherwise) - confirm.
*/
void __attribute__((naked, used)) PendSV_Handler(void)	//guaranteed to be entered from thread mode only 
{
	
	#ifdef BUILD_FOR_THUMB_1
		asm volatile(
			".syntax unified					\n\t"
			"	mov     r0, r8					\n\t"		//the push below can only take low regs, so stage r8..r11 in r0..r3
			"	mov     r1, r9					\n\t"
			"	mov     r2, r10					\n\t"
			"	mov     r3, r11					\n\t"
			"	push    {r0-r7, lr}				\n\t"		//push r8..r11, r4-r7, lr (for space)
			"	mrs		r0, psp					\n\t"		//arg 0: thread's exception frame
			"	mov		r1, sp					\n\t"		//arg 1: the block we just pushed
			"	bl		schedContextSwitchC		\n\t"
			"	msr		psp, r0					\n\t"		//set thread's SP
			"	pop		{r0-r7}					\n\t"		//r0..r3 = new r8..r11, r4..r7 = new r4..r7
			"	mov     r8, r0					\n\t"
			"	mov     r9, r1					\n\t"
			"	mov     r10, r2					\n\t"
			"	mov     r11, r3					\n\t"
			"	pop		{r0}					\n\t"		//get "CONTROL" reg
			"	msr		CONTROL, r0				\n\t"
			"	movs	r0, #~0xfffffffd		\n\t"		//always go back to thread mode on process stack
			"	mvns    r0, r0					\n\t"		//build 0xfffffffd by inverting its complement
			"	bx		r0						\n\t"		//we cannot load directly to pc for BX
		);
	#else
		asm volatile(
			".syntax unified					\n\t"
			"	mrs		r0, psp					\n\t"		//arg 0: thread's exception frame
			"	push	{r4-r11, r12}			\n\t"		//r4..r11 plus one extra word (r12's slot)
			"	mov		r1, sp					\n\t"		//arg 1: the block we just pushed
			"	bl		schedContextSwitchC		\n\t"
			"	pop		{r4-r11, r12}			\n\t"		//new r4..r11; r12 receives the extra word
			"	msr		CONTROL, r12			\n\t"
			"	msr		psp, r0					\n\t"		//set thread's SP
			"	mov		r0, #0xfffffffd			\n\t"		//always go back to thread mode on process stack
			"	bx		r0						\n\t"		//we cannot load directly to pc for BX
		);
	#endif
}

//Request a context switch by setting the PendSV pending bit in SCB->ICSR. The actual switch happens in PendSV_Handler.
//The caller must be in syscall context or have irqs off (checked by schedAssertCalledInSyscallMode(), fatal otherwise).
void schedRequestContextSwitch(void)				//if you're not schedPickNextTask(), do not call this!
{
	schedAssertCalledInSyscallMode();
	
	//schedule a pendsv
	//the empty asm statements are compiler-only memory barriers: they keep the compiler from moving memory accesses across the ICSR write
	asm volatile("":::"memory");
	SCB->ICSR = SCB_ICSR_PENDSVSET_Msk;
	asm volatile("":::"memory");
}

#ifdef KERNEL_SUPPORTS_VFP

	//Called from schedHandleUsageFaultNoCp() when the current task touches coprocessor 10 or 11 (the FPU):
	//marks the task as an FPU user (so the context switch code saves/restores its FPU regs) and enables the FPU with a cleared FPSCR.
	//Dereferences the current task pointer without a NULL check, so there must be a current task.
	static void schedHandleFpuUsed(void)
	{
		(*schedGetCurTaskP())->fpuCtx.flags |= SCHED_FPU_ENABLED;
		
		//set CPACR for fpu use
		SCB->CPACR |= 0xf << 20;
		
		//make sure the hardware does not try to use extended exception frames (we do not need them), or track fpu use (we do that ourselves)
		FPU->FPCCR = 0;
		
		//clear fpscr
		asm("vmsr FPSCR, %0"::"r"(0));
	}

#endif

/*
	Handle a UsageFault raised for a "no coprocessor" (NOCP) access.
	
	ctx      - exception frame of the faulting context; ctx->pc is the address of the faulting instruction
	ufsrAddr - address of the UFSR register, written here to acknowledge the fault
	ufsrVal  - UFSR value as read by the caller (not needed here)
	
	If the kernel supports VFP and the access was to coprocessor 10 or 11, the FPU is enabled for the current
	task and we return. NOTE(review): presumably the faulting instruction is then re-executed - confirm in the caller.
	Any other coprocessor use is fatal.
*/
void __attribute__((used)) schedHandleUsageFaultNoCp(struct CortexExcFrame *ctx, volatile uint16_t *ufsrAddr, uint32_t ufsrVal)
{
	//coprocessor number is the low nibble of the 4th byte of the faulting instruction
	uint32_t coproc = (*(uint8_t*)(ctx->pc + 3)) & 0x0f;
	
	(void)ufsrVal;
	
	*ufsrAddr = 0x08;	//clear "NOCP" bit
	
	#ifdef KERNEL_SUPPORTS_VFP
		if (coproc == 10 || coproc == 11) {
			schedHandleFpuUsed();
			return;
		}
	#endif
	
	//every value needs its own conversion: the LR field used to read "0x08x" (no '%'), which shifted ctx->lr into the "ctx at" slot
	fatal("Unexpected use of coprocessor %u at 0x%08x (LR is 0x%08x, ctx at 0x%08x)\n", coproc, ctx->pc, ctx->lr, ctx);
}

/*
	SVC exception entry. Finds the exception frame of the code that executed the SVC instruction,
	extracts the SVC immediate from the instruction itself, and tail-calls syscallHandle with:
		r0 = SVC immediate (low byte of the SVC instruction, which sits 2 bytes before the stacked PC)
		r1 = pointer to the exception frame (MSP or PSP, chosen by bit 2 of the EXC_RETURN value in LR)
		r2 = the caller's r4
	LR is left untouched, so syscallHandle's return performs the exception return.
	NOTE(review): the meaning of the r4 value passed in r2 is defined by syscallHandle (not in this file) - confirm there.
*/
void __attribute__((used,naked)) SVC_Handler(void)
{
	#ifdef BUILD_FOR_THUMB_1
		asm volatile(
			".syntax unified				\n\t"
			"	mov   r1, lr				\n\t"
			"	lsrs  r1, #3				\n\t"	//shifts bit 2 of LR into the carry flag
			"	bcs   1f					\n\t"	//carry set -> frame is on PSP
			"	mrs   r1, msp				\n\t"	//grab the appropriate SP
			"	b     2f					\n\t"
			"1:								\n\t"
			"	mrs   r1, psp				\n\t"
			"2:								\n\t"
			"	ldr   r0, [r1, #0x18]		\n\t"	//grab PC
			"	subs  r0, #2				\n\t"	//point r0 to SVC instr's immediate
			"	ldrb  r0, [r0]				\n\t"	//grab the SVC immediate
			"	mov   r2, r4				\n\t"	//third argument: caller's r4
			"	ldr   r3, =syscallHandle	\n\t"	//address loaded from the literal pool emitted by .ltorg below
			"	bx    r3					\n\t"	//tail call, LR still holds EXC_RETURN
			"	.ltorg						\n\t"
		);
	#else
		asm volatile(
			".syntax unified				\n\t"
			"	tst   lr, #4				\n\t"	//see which stack the exception frame was pushed on
			"	ite   eq					\n\t"
			"	mrseq r1, msp				\n\t"	//grab the appropriate SP
			"	mrsne r1, psp				\n\t"
			"	ldr   r0, [r1, #0x18]		\n\t"	//grab PC
			"	ldrb  r0, [r0, #-2]			\n\t"	//grab the SVC immediate
			"	mov   r2, r4				\n\t"	//third argument: caller's r4
			"	b     syscallHandle			\n\t"	//tail call, LR still holds EXC_RETURN
		);
	#endif
}

/*
	One-way transition from the initial (MSP, unscheduled) execution into scheduled operation.
	Sets PSP to the current SP plus the size of one exception frame, switches thread mode to PSP by
	writing 2 (SPSEL) to CONTROL, then issues the SYSC_TASK_YIELD syscall (number in r0, SVC immediate KERNEL_SWI_NUM).
	Does not return.
*/
void schedSwitchToScheduledRegimeByCallingYield(void)
{
	uint32_t tmp;
	
	//we need to switch to PSP, but it is not valid and the PendSV will try to use it.
	//so copy MSP to PSP just this once, then set nPRIV and SPSEL, then request a rescheduling
	//BUT if we do just that, the exc will use our current stack and overwrite the exc frame itself
	//so we'll allocate enough for an exc frame func (by making PSP be *higher* than MSP by a frame)
	//NOTE(review): the asm writes r0 and the flags (movs) without listing them as clobbers. Operands are all consumed
	//before that and control never comes back, so it looks harmless - confirm
	asm volatile(
		".syntax unified				\n\t"
		"	add   %0, sp, %2			\n\t"	//tmp = SP + sizeof(struct CortexExcFrame)
		"	msr   PSP, %0				\n\t"
		"	msr   CONTROL, %1			\n\t"	//after this we SHOULD be on PSP
		"	isb							\n\t"	//make the CONTROL write take effect before continuing
		"	movs  r0, %3				\n\t"	//syscall number: SYSC_TASK_YIELD
		"	svc   %4					\n\t"
		:"=&r"(tmp)
		:"r"(2)	/*SPSEL bit (which we can write now since we ARE in thread mode) */, "I"(sizeof(struct CortexExcFrame)), "I"(SYSC_TASK_YIELD), "I"(KERNEL_SWI_NUM)
	);
	//we do not get here
	__builtin_unreachable();
	while(1);	//make it clear to compiler
}