#ifndef _M0_FAULT_DISPATCH_
#define _M0_FAULT_DISPATCH_

/*

	comes in a few pieces so you can insert your own things in the middle
	do not include more than once, or the result will not link (its global symbols would be defined twice)

	define the following macros before including this file to hook in more processing
	
	CODE_TO_GO_TO_ARM
	
		code to jump to your own way to do arm
		 * r0 contains exc frame
		 * r2 contains the ARM pc
		 * r1, r3, r12 are all yours
		 * LR is whatever you need to return from this exception
		 * numeric labels 5 and later are yours
	
	CODE_GOTO_ARM_PC_IN_EXC_VALID
		same as the above, except exc->pc already has the arm PC
		 * BUT, r2 does not!!!
	
	CODE_FOR_UNDEF_32BIT_INSTR
		
		code to run in case of an undefined 32-bit instr
		 * r0 contains exc frame
		 * r1 has first halfword
		 * r2 has second halfword
		 * r3, r12 are all yours
		 * LR is whatever you need to return from this exception
		 * numeric labels 5 and later are yours
	
	CODE_FOR_POSSIBLY_UNDEF_16BIT_INSTR (could also be defined, up to you to check!)
		 * r0 contains exc frame
		 * r1 has the instr (do not clobber it)
		 * r2 has pc
		 * r2, r3, r12 are all yours
		 * LR is whatever you need to return from this exception
		 * numeric labels 5 and later are yours

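	EXTRA_ASM_INPUTS
		if you want to use asm inputs, start at 0 and use this variable to define them

	CODE_AT_END_EXTRA
		extra asm that is emitted after all of the handler code, just before the final literal pool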

*/

//#define ROMRAM_DEBUG			//slows things down but checks all writes (does not check for corruption around them)



#define STR2(tok)		#tok			//turn the argument into a string literal exactly as written
#define STR(tok)		STR2(tok)		//macro-expand the argument first, then stringify the result



//mapping
//old	new		use
//r3	r0		exc
//r0	r1		instr16

/* cannot be in ramcode since default memory map disallows execution from usb memory */
/*
	HardFault_Handler_container

	Naked container for the M0 hard-fault dispatcher. The body is one asm statement; the exported
	entry point is the HardFault_Handler label defined inside it, this C symbol only carries the code.

	Register conventions used throughout:
	 * r0 = pointer to the exception frame picked at HardFault_Handler (from sp or psp)
	 * frame words used here: 0..3 = r0..r3, 5 = lr, 6 = pc, 7 = flags
	 * r1 = faulting 16-bit instr (or first halfword of a 32-bit one), r2 = its address
	 * r4..r7 are NOT in the frame: they are read live, or from our own stack once we pushed them
	 * lr holds the value needed to return from the exception ("bx lr" / "pop {pc}" ends handling)

	Flow:
	 * HardFault_Handler picks the frame; if the pushed flags have T clear it sets T and runs
	   CODE_GOTO_ARM_PC_IN_EXC_VALID, otherwise it falls into not_arm_mode
	 * not_arm_mode dispatches on the top bits of the faulting instr through a 128-entry branch table
	 * stores whose target lies in [mRomRamStart, mRomRamStart + mRomRamLen) are emulated by
	   write_4b / write_2b / write_1b / do_stmia, the pushed pc is advanced by 2, and we return
	 * "bx pc" and BLX-immediate hand over to CODE_TO_GO_TO_ARM (or straight to a Thumb OsCall target)
	 * everything else ends in report_some_fault -> faultHandlerWithExcFrame(frame, cause, 0)

	The jump tables below rely on exact instruction counts per entry; do not insert or remove
	instructions inside them.
*/
void __attribute__((used,naked, section(".data"))) HardFault_Handler_container(void)
{
	asm volatile(
	
		".syntax unified									\n\t"
		".ltorg												\n\t"	//dump any immediates from before so they do not waste our branch range
		
		//maybe_bx_pc: table target for instrs with top byte 0x46/0x47
		//	in: r0 = frame, r1 = instr, r2 = address of instr
		//	if the instr is exactly 0x4778 ("bx pc"), r2 becomes the ARM target and we fall into CODE_TO_GO_TO_ARM
		"maybe_bx_pc:										\n\t"	//r1 is instr, r0 is state
		"	movs  r3, #0x47									\n\t"	//r3 = 0x4778, built in three steps
		"	lsls  r3, #8									\n\t"
		"	adds  r3, #0x78									\n\t"
		"	cmp   r3, r1									\n\t"
		"	bne   instr16_maybe_udf							\n\t"
		"	lsrs  r2, #2									\n\t"	//r2 = (r2 rounded down to a multiple of 4) + 4
		"	adds  r2, #1									\n\t"
		"	lsls  r2, #2									\n\t"
		"go_to_raw_arm_addr:								\n\t"	//r2 = ARM pc to continue at, r0 = frame
		CODE_TO_GO_TO_ARM
		
		//16 bit instr that is not a "bx pc"
		"instr16_maybe_udf:									\n\t"	//r0 = frame, r1 = instr, r2 = address of instr
		CODE_FOR_POSSIBLY_UNDEF_16BIT_INSTR
		"b	report_some_fault								\n\t"	//hook did not handle it
		
		"instr32_maybe_blx:									\n\t"	//r1 is instr part 1, r0 is state, we know the first instr starts with 11110, so we only need to check the second to verify that it *IS* a blx
		"	mov   r12, r2									\n\t"	//keep the instr address for the LR calculation
		"	ldrh  r2, [r2, #2]								\n\t"	//r2 = second halfword
		"	lsrs  r3, r2, #11								\n\t"
		"	cmp   r3, #0x1d									\n\t"	//top five bits of second halfword must be 11101
		"	bne   instr32_inval_have_part_2					\n\t"
		
		"instr32_is_blx_imm:								\n\t"
		"	lsls  r1, #21									\n\t"	//r1 = sign-extended low 11 bits of halfword 1, shifted left by 12
		"	asrs  r1, #9									\n\t"
		"	lsls  r2, #21									\n\t"	//r2 = low 11 bits of halfword 2, shifted left by 1
		"	lsrs  r2, #20									\n\t"
		"	adds  r2, r1									\n\t"	//offset
		"	mov   r1, r12									\n\t"	//the PC at call site
		"	adds  r1, #5									\n\t"	//lr
		"	str   r1, [r0, #4 * 5]							\n\t"	//set exce frame's LR
		"	adds  r2, r1									\n\t"
		"	lsrs  r2, #2									\n\t"	//clear the low two bits
		"	lsls  r2, #2									\n\t"	//target address of the BLX
		
		//check if this is an OsCall (a speed optimization)
		//the target must start with the two ARM words
		//	0xe519c000 + n	(n = 4, 8 or 12; this is "ldr r12, [r9, #-n]")
		//	0xe59cf000 + m	(m = multiple of 4 below 0x1000; this is "ldr pc, [r12, #m]")
		//if so we load the function pointer ourselves instead of interpreting those two instrs
		"blx_check_for_oscall:								\n\t"
		"	ldr   r1, [r2]									\n\t"	//load instr, on failure, branch is skipped too
		"	movs  r3, #0x0c									\n\t"
		"	ands  r3, r1									\n\t"	//r3 = n
		"	beq   go_to_raw_arm_addr						\n\t"	//not oscall
		"	negs  r3, r3									\n\t"	//offset is negative
		"	mov   r12, r3									\n\t"	//save the offset
		"	add   r1, r12									\n\t"	//remove the offset to check instr
		"	ldr   r3, =0xe519c000							\n\t"
		"	cmp   r1, r3									\n\t"
		"	bne   go_to_raw_arm_addr						\n\t"
		"	mov   r3, r9									\n\t"	//uses the live r9 of the faulting code
		"	add   r3, r12									\n\t"
		"	ldr   r3, [r3]									\n\t"
		"	mov   r12, r3									\n\t"	//r12 = [r9, #-table]
		
		//check word #2
		"	ldr   r1, [r2, #4]								\n\t"	//load instr, on failure, branch is skipped too
		#ifdef HAVE_v8M_BASE
			"	movw  r3, #0x00000ffc						\n\t"
		#else	
			"	movs  r3, #0x0f								\n\t"	//r3 = 0x0ffc without movw
			"	lsls  r3, #8								\n\t"
			"	adds  r3, #0xfc								\n\t"
		#endif
		"	ands  r3, r1									\n\t"	//funcOffset
		"	add   r12, r3									\n\t"	//pointer to fun caddr
		"	subs  r1, r3									\n\t"
		"	ldr   r3, =0xe59cf000							\n\t"
		"	cmp   r1, r3									\n\t"
		"	bne   go_to_raw_arm_addr						\n\t"
		
		"blx_is_to_os_call:									\n\t"
		"	mov   r2, r12									\n\t"
		"	ldr   r2, [r2]									\n\t"	//os call func addr
		"	lsrs  r1, r2, #1								\n\t"	//carry = bit 0 of the func addr
		"	bcc   go_to_raw_arm_addr						\n\t"	//ok, go interp there
		
		"blx_is_to_oscall_impl_in_thumb:					\n\t"
		"	lsls  r1, #1									\n\t"	//set it to PC and go!
		"	str   r1, [r0, #4 * 6]							\n\t"	//frame pc = func addr with bit 0 cleared
		"	bx    lr										\n\t"	//return from the exception

		
		//table target for first halfwords 0xe800..0xefff and 0xf800..0xffff
		"instr32_inval:										\n\t"
		"	ldr   r2, [r0, #4 * 6]							\n\t"	//r2 = pushed pc
		"	ldrh  r2, [r2, #2]								\n\t"	//r2 = second halfword
		"instr32_inval_have_part_2:							\n\t"	//r0 = frame, r1 = first halfword, r2 = second halfword
		CODE_FOR_UNDEF_32BIT_INSTR
		"b	report_some_fault								\n\t"	//hook did not handle it
		

		//grab the appropriate SP
		".globl HardFault_Handler							\n\t"
		".func HardFault_Handler							\n\t"
		".type HardFault_Handler function					\n\t"
		"HardFault_Handler:									\n\t"
		"	mov   r0, lr									\n\t"
		"	lsrs  r0, #3									\n\t"	//carry = bit 2 of lr
		"	bcs   1f										\n\t"
		"	mov   r0, sp									\n\t"	//bit clear: frame is on the current stack
		"	b     2f										\n\t"
		"1:													\n\t"
		"	mrs   r0, psp									\n\t"	//bit set: frame is on the process stack
		"2:													\n\t"
		
		//check for ARM mode (necessary since our emu-for-write code can only handle thumb)
		"	ldr   r1, [r0, #4 * 7]							\n\t"	//load pushed flags
		"	movs  r3, #1									\n\t"
		"	lsls  r3, #24									\n\t"	//T flag
		"	tst   r1, r3									\n\t"	//check for T bit
		"	bne   not_arm_mode								\n\t"	//if it is set, further testing to be done here
		"	orrs  r1, r3									\n\t"	//T was clear: set it so the frame can be returned to
		"	str   r1, [r0, #4 * 7]							\n\t"	//store into pushed flags
		
		CODE_GOTO_ARM_PC_IN_EXC_VALID
		".ltorg												\n\t"
		
		//to emulate-for-write fast, we must assume that PC points somewhere valid
		//otherwise we'd have to take the penalty of switching to our safe mode, and then wrangling the MPU
		//whereas now we can use "hard fault uses default map" mode
		
		//dispatch on the faulting instr. r3 = instr >> 8 is added to pc (which reads as the address of
		//the first table entry). Entries are 2 bytes each and there are 128 of them, so each entry covers
		//two values of the top byte (this relies on bit 0 of the computed address being ignored, see the
		//ARMv6-M description of "add pc, Rm"). Every "b" here must assemble to the 16-bit form.
		//on arrival at a target: r0 = frame, r1 = instr, r2 = address of instr, r3 = instr >> 8
		"not_arm_mode:										\n\t"
		"	ldr		r2, [r0, #4 * 6]						\n\t"	//r2 = pushed pc
		"	ldrh	r1, [r2]								\n\t"	//r1 = faulting instr (first halfword)
		"	lsrs	r3, r1, #8								\n\t"
		"	add		pc, r3									\n\t"
		"	nop												\n\t"	//padding so that the table starts where pc reads
		".rept 35											\n\t"	//top byte 0x00..0x45
		"	b		report_some_fault						\n\t"
		".endr												\n\t"
		"	b		maybe_bx_pc								\n\t"	//top byte 0x46..0x47
		".rept 4											\n\t"	//top byte 0x48..0x4f
		"	b		report_some_fault						\n\t"
		".endr												\n\t"
		"	b		str_2									\n\t"	//top byte 0x50..0x51
		"	b		strh_2									\n\t"	//top byte 0x52..0x53
		"	b		strb_2									\n\t"	//top byte 0x54..0x55
		".rept 5											\n\t"	//top byte 0x56..0x5f
		"	b		report_some_fault						\n\t"
		".endr												\n\t"
		".rept 4											\n\t"	//top byte 0x60..0x67
		"	b		str_1									\n\t"
		".endr												\n\t"
		".rept 4											\n\t"	//top byte 0x68..0x6f
		"	b		report_some_fault						\n\t"
		".endr												\n\t"
		".rept 4											\n\t"	//top byte 0x70..0x77
		"	b		strb_1									\n\t"
		".endr												\n\t"
		".rept 4											\n\t"	//top byte 0x78..0x7f
		"	b		report_some_fault						\n\t"
		".endr												\n\t"
		".rept 4											\n\t"	//top byte 0x80..0x87
		"	b		strh_1									\n\t"
		".endr												\n\t"
		".rept 28											\n\t"	//top byte 0x88..0xbf
		"	b		report_some_fault						\n\t"
		".endr												\n\t"
		"	b		stmia_r0r1r2r3							\n\t"	//top byte 0xc0..0xc1
		"	b		stmia_r0r1r2r3							\n\t"	//top byte 0xc2..0xc3
		"	b		stmia_r4r5r6r7							\n\t"	//top byte 0xc4..0xc5
		"	b		stmia_r4r5r6r7							\n\t"	//top byte 0xc6..0xc7
		".rept 11											\n\t"	//top byte 0xc8..0xdd
		"	b		report_some_fault						\n\t"
		".endr												\n\t"
		"	b		instr16_maybe_udf						\n\t"	//top byte 0xde..0xdf
		".rept 4											\n\t"	//top byte 0xe0..0xe7
		"	b		report_some_fault						\n\t"
		".endr												\n\t"
		".rept 4											\n\t"	//top byte 0xe8..0xef
		"	b		instr32_inval							\n\t"
		".endr												\n\t"
		".rept 4											\n\t"	//top byte 0xf0..0xf7
		"	b		instr32_maybe_blx						\n\t"
		".endr												\n\t"
		".rept 4											\n\t"	//top byte 0xf8..0xff
		"	b		instr32_inval							\n\t"
		".endr												\n\t"
		
		//bail-out for the single-store stubs: undo their "push {r4, lr}" first
		"report_some_fault_pop_r4lr:						\n\t"
		"	pop		{r4}									\n\t"
		"	pop		{r3}									\n\t"
		"	mov		lr, r3									\n\t"
		//tail-call faultHandlerWithExcFrame with r0 = frame, r1 = cause, r2 = 0; lr is left untouched
		"report_some_fault:									\n\t"
		"	movs	r1, #"STR(EXC_m0_CAUSE_UNCLASSIFIABLE)"	\n\t"
		"	movs	r2, #0									\n\t"
		"	ldr		r3, =faultHandlerWithExcFrame			\n\t"
		"	bx		r3										\n\t"
		".ltorg												\n\t"
		
		//the six single-store stubs below all follow the same pattern:
		//	1. push {r4, lr} (r4 is needed as scratch, lr is clobbered by the bl calls)
		//	2. compute the store address into r3 (r4..r7 are still live at this point)
		//	3. fault unless (addr - mRomRamStart) < mRomRamLen, compared unsigned
		//	4. advance the pushed pc past the 2-byte instr
		//	5. fetch the value of the register named in instr bits 0..2 into r1
		//	6. jump to the writer, which finishes with "pop {r4, pc}"
		//NOTE(review): only the start address is range-checked, not the end of the access
		"str_2:												\n\t"	//r0 is state, r1 is instr
		"	push	{r4, lr}								\n\t"
		"	bl		get_addr_2reg							\n\t"	//r3 is now addr
		"	ldr		r2, mRomRamStart						\n\t"
		"	ldr		r4, mRomRamLen							\n\t"
		"	subs	r2, r3, r2								\n\t"
		"	cmp		r2, r4									\n\t"
		"	bcs		report_some_fault_pop_r4lr				\n\t"
		"	ldr		r2, [r0, #4 * 6]						\n\t"
		"	adds	r2, #2									\n\t"
		"	str		r2, [r0, #4 * 6]						\n\t"
		"	bl		get_rd_0								\n\t"	//r1 is now val
		"	b		write_4b								\n\t"
		
		"strh_2:											\n\t"	//same as str_2 but a 2-byte write
		"	push	{r4, lr}								\n\t"
		"	bl		get_addr_2reg							\n\t"	//r3 is now addr
		"	ldr		r2, mRomRamStart						\n\t"
		"	ldr		r4, mRomRamLen							\n\t"
		"	subs	r2, r3, r2								\n\t"
		"	cmp		r2, r4									\n\t"
		"	bcs		report_some_fault_pop_r4lr				\n\t"
		"	ldr		r2, [r0, #4 * 6]						\n\t"
		"	adds	r2, #2									\n\t"
		"	str		r2, [r0, #4 * 6]						\n\t"
		"	bl		get_rd_0								\n\t"	//r1 is now val
		"	b		write_2b								\n\t"
		
		"strb_2:											\n\t"	//same as str_2 but a 1-byte write
		"	push	{r4, lr}								\n\t"
		"	bl		get_addr_2reg							\n\t"	//r3 is now addr
		"	ldr		r2, mRomRamStart						\n\t"
		"	ldr		r4, mRomRamLen							\n\t"
		"	subs	r2, r3, r2								\n\t"
		"	cmp		r2, r4									\n\t"
		"	bcs		report_some_fault_pop_r4lr				\n\t"
		"	ldr		r2, [r0, #4 * 6]						\n\t"
		"	adds	r2, #2									\n\t"
		"	str		r2, [r0, #4 * 6]						\n\t"
		"	bl		get_rd_0								\n\t"	//r1 is now val
		"	b		write_1b								\n\t"
		
		"str_1:												\n\t"	//base + immediate form, 4-byte write
		"	push	{r4, lr}								\n\t"
		"	movs	r3, #2									\n\t"	//immediate is scaled by 4
		"	bl		get_addr_with_imm						\n\t"	//r3 is now addr
		"	ldr		r2, mRomRamStart						\n\t"
		"	ldr		r4, mRomRamLen							\n\t"
		"	subs	r2, r3, r2								\n\t"
		"	cmp		r2, r4									\n\t"
		"	bcs		report_some_fault_pop_r4lr				\n\t"
		"	ldr		r2, [r0, #4 * 6]						\n\t"
		"	adds	r2, #2									\n\t"
		"	str		r2, [r0, #4 * 6]						\n\t"
		"	bl		get_rd_0								\n\t"	//r1 is now val
		"	b		write_4b								\n\t"
		
		"strb_1:											\n\t"	//base + immediate form, 1-byte write
		"	push	{r4, lr}								\n\t"
		"	movs	r3, #0									\n\t"	//immediate is not scaled
		"	bl		get_addr_with_imm						\n\t"	//r3 is now addr
		"	ldr		r2, mRomRamStart						\n\t"
		"	ldr		r4, mRomRamLen							\n\t"
		"	subs	r2, r3, r2								\n\t"
		"	cmp		r2, r4									\n\t"
		"	bcs		report_some_fault_pop_r4lr				\n\t"
		"	ldr		r2, [r0, #4 * 6]						\n\t"
		"	adds	r2, #2									\n\t"
		"	str		r2, [r0, #4 * 6]						\n\t"
		"	bl		get_rd_0								\n\t"	//r1 is now val
		"	b		write_1b								\n\t"
		
		"strh_1:											\n\t"	//base + immediate form, 2-byte write
		"	push	{r4, lr}								\n\t"
		"	movs	r3, #1									\n\t"	//immediate is scaled by 2
		"	bl		get_addr_with_imm						\n\t"	//r3 is now addr
		"	ldr		r2, mRomRamStart						\n\t"
		"	ldr		r4, mRomRamLen							\n\t"
		"	subs	r2, r3, r2								\n\t"
		"	cmp		r2, r4									\n\t"
		"	bcs		report_some_fault_pop_r4lr				\n\t"
		"	ldr		r2, [r0, #4 * 6]						\n\t"
		"	adds	r2, #2									\n\t"
		"	str		r2, [r0, #4 * 6]						\n\t"
		"	bl		get_rd_0								\n\t"	//r1 is now val
		"	b		write_2b								\n\t"
		
		//stmia entry stubs: save r4..r7 on our stack and make r3 point at the stored copy of the base register
		"stmia_r0r1r2r3:									\n\t"	//r0 is state, r1 is instr, r2 is pc, r3 is "instr >> 8"
		"	push	{r4-r7}									\n\t"
		"	subs	r3, #0xc0								\n\t"	//r3 = base register number (0..3)
		"	lsls	r3, r3, #2								\n\t"
		"	adds	r3, r0, r3								\n\t"	//r3 points to where base reg is
		"	b		do_stmia								\n\t"
		
		"stmia_r4r5r6r7:									\n\t"	//r0 is state, r1 is instr, r2 is pc, r3 is "instr >> 8"
		"	push	{r4-r7}									\n\t"
		"	subs	r3, #0xc4								\n\t"	//r3 = base register number - 4 (0..3)
		"	lsls	r3, r3, #2								\n\t"
		"	add		r3, sp									\n\t"	//r3 points to where base reg is
		"	b		do_stmia								\n\t"
		
		//write_4b / write_2b / write_1b: push one value out through the peripheral at 0x18000000
		//	in: r1 = value, r3 = address, stack = {r4, lr} as pushed by the store stubs
		//	clobbers r0 (the frame pointer is no longer needed), r2, r3, r12; write_2b/write_1b also use r4
		//register offsets used: 0x00 = config word (saved in r12, modified, restored afterwards),
		//	0x08 = enable (0 = off, 1 = on), 0x60 = data (command 0x38, then address, then value),
		//	0x28 = status (bit 0 is polled until it clears)
		//NOTE(review): presumably the XIP SSI block (CTRLR0 / SSIENR / DR0 / SR) - confirm against the chip datasheet
		"write_4b:											\n\t"	//r1 is val, r3 is addr
#ifdef ROMRAM_DEBUG
		"	push	{r1, r3}								\n\t"
#endif
		"	strb	r1, [r3]								\n\t"	//flush cache line
		"	movs	r2, #0x18								\n\t"
		"	lsls	r2, r2, #24								\n\t"	//r2 = 0x18000000
		"	strb	r2, [r2, #0x08]							\n\t"	//use lane replication to disable ssi
		"	ldr		r0, [r2, #0x00]							\n\t"
		"	mov		r12, r0									\n\t"	//remember the original config word
		"	subs	r0, #0xc0								\n\t"	//minus 0x200
		"	subs	r0, #0xc0								\n\t"
		"	subs	r0, #0x80								\n\t"
		"	str		r0, [r2, #0x00]							\n\t"
		"	movs	r0, #1									\n\t"
		"	str		r0, [r2, #0x08]							\n\t"	//enable again
		"	movs	r0, #0x38								\n\t"
		"	str		r0, [r2, #0x60]							\n\t"	//send cmd
		"	str		r3, [r2, #0x60]							\n\t"	//send addr
		"	rev     r1, r1									\n\t"	//byte-swap the word before sending
		"	str		r1, [r2, #0x60]							\n\t"	//send data
		"	mov		r3, r12									\n\t"	//r3 = original config word
		"1:													\n\t"
		"	ldr		r0, [r2, #0x28]							\n\t"
		"	lsrs	r0, r0, #1								\n\t"
		"	bcs		1b										\n\t"	//wait while status bit 0 is set
		"	strb	r2, [r2, #0x08]							\n\t"	//use lane replication to disable ssi
		"	str		r3, [r2, #0x00]							\n\t"	//restore the original config word
		"	movs	r0, #1									\n\t"
		"	str		r0, [r2, #0x08]							\n\t"
#ifdef ROMRAM_DEBUG
		"	pop		{r1, r3}								\n\t"
		"	ldr		r2, [r3]								\n\t"	//read back and hang here forever on mismatch
		"	cmp		r1, r2									\n\t"
		"1:													\n\t"
		"	bne		1b										\n\t"
#endif
		"	pop		{r4, pc}								\n\t"	//restore r4 and return from the exception
		
		"write_2b:											\n\t"	//r1 is val, r3 is addr
#ifdef ROMRAM_DEBUG
		"	push	{r1, r3}								\n\t"
#endif
		"	strb	r1, [r3]								\n\t"	//flush cache line
		"	movs	r2, #0x18								\n\t"
		"	lsls	r2, r2, #24								\n\t"	//r2 = 0x18000000
		"	strb	r2, [r2, #0x08]							\n\t"	//use lane replication to disable ssi
		"	ldr		r0, [r2, #0x00]							\n\t"
		"	mov		r12, r0									\n\t"	//remember the original config word
		"	ldr		r4, =0x00100200							\n\t"	//16 bit write, tx only
		"	subs	r0, r0, r4								\n\t"
		"	str		r0, [r2, #0x00]							\n\t"
		"	movs	r0, #1									\n\t"
		"	str		r0, [r2, #0x08]							\n\t"	//enable again
		"	movs	r0, #0x38								\n\t"
		"	str		r0, [r2, #0x60]							\n\t"	//send cmd
		"	str		r3, [r2, #0x60]							\n\t"	//send addr
		"	rev16	r1, r1									\n\t"	//byte-swap the halfword before sending
		"	str		r1, [r2, #0x60]							\n\t"	//send data
		"	mov		r3, r12									\n\t"	//r3 = original config word
		"1:													\n\t"
		"	ldr		r0, [r2, #0x28]							\n\t"
		"	lsrs	r0, r0, #1								\n\t"
		"	bcs		1b										\n\t"	//wait while status bit 0 is set
		"	strb	r2, [r2, #0x08]							\n\t"	//use lane replication to disable ssi
		"	str		r3, [r2, #0x00]							\n\t"	//restore the original config word
		"	movs	r0, #1									\n\t"
		"	str		r0, [r2, #0x08]							\n\t"
#ifdef ROMRAM_DEBUG
		"	pop		{r1, r3}								\n\t"
		"	uxth	r1,r1									\n\t"
		"	ldrh	r2, [r3]								\n\t"	//read back and hang here forever on mismatch
		"	cmp		r1, r2									\n\t"
		"1:													\n\t"
		"	bne		1b										\n\t"
#endif
		"	pop		{r4, pc}								\n\t"	//restore r4 and return from the exception
		
		"write_1b:											\n\t"	//r1 is val, r3 is addr
#ifdef ROMRAM_DEBUG
		"	push	{r1, r3}								\n\t"
#endif
		"	strb	r1, [r3]								\n\t"	//flush cache line
		"	movs	r2, #0x18								\n\t"
		"	lsls	r2, r2, #24								\n\t"	//r2 = 0x18000000
		"	strb	r2, [r2, #0x08]							\n\t"	//use lane replication to disable ssi
		"	ldr		r0, [r2, #0x00]							\n\t"
		"	mov		r12, r0									\n\t"	//remember the original config word
		"	ldr		r4, =0x00180200							\n\t"	//8 bit write, tx only
		"	subs	r0, r0, r4								\n\t"
		"	str		r0, [r2, #0x00]							\n\t"
		"	movs	r0, #1									\n\t"
		"	str		r0, [r2, #0x08]							\n\t"	//enable again
		"	movs	r0, #0x38								\n\t"
		"	str		r0, [r2, #0x60]							\n\t"	//send cmd
		"	str		r3, [r2, #0x60]							\n\t"	//send addr
		"	str		r1, [r2, #0x60]							\n\t"	//send data (a single byte needs no swap)
		"	mov		r3, r12									\n\t"	//r3 = original config word
		"1:													\n\t"
		"	ldr		r0, [r2, #0x28]							\n\t"
		"	lsrs	r0, r0, #1								\n\t"
		"	bcs		1b										\n\t"	//wait while status bit 0 is set
		"	strb	r2, [r2, #0x08]							\n\t"		//use lane replication to disable ssi
		"	str		r3, [r2, #0x00]							\n\t"	//restore the original config word
		"	movs	r0, #1									\n\t"
		"	str		r0, [r2, #0x08]							\n\t"
#ifdef ROMRAM_DEBUG
		"	pop		{r1, r3}								\n\t"
		"	uxtb	r1,r1									\n\t"
		"	ldrb	r2, [r3]								\n\t"	//read back and hang here forever on mismatch
		"	cmp		r1, r2									\n\t"
		"1:													\n\t"
		"	bne		1b										\n\t"
#endif
		"	pop		{r4, pc}								\n\t"	//restore r4 and return from the exception
		
		".ltorg												\n\t"
		
		//bail-out for the stmia path. It undoes exactly the "push {r4-r7}" of the entry stubs, so nothing
		//else may be on the stack when branching here
		"j_report_some_fault:								\n\t"
		"	pop		{r4-r7}									\n\t"
		"	b		report_some_fault						\n\t"
		
		//do_stmia: emulate a store-multiple of low registers with base writeback
		//	in: r0 = frame, r1 = instr (low 8 bits = register list), r3 = pointer to the stored base register,
		//	    stack = {r4-r7} as pushed by the entry stubs, lr = exception return value
		//	regs: r2 = 0x18000000, r4 = running address, r5 = value being sent, r6 = saved config word, r7 = scratch
		//NOTE(review): only the base address is range-checked, not the end of the transfer
		"do_stmia:											\n\t"		//r0 is state, r1 is instr, r3 is &base_addr
	
		"	lsls	r2, r1, #24								\n\t"		//an empty register list is not emulated
		"	beq		j_report_some_fault						\n\t"
		
		"	ldr		r4, [r3]								\n\t"		//addr

		"	ldr		r2, mRomRamStart						\n\t"
		"	ldr		r5, mRomRamLen							\n\t"
		"	subs	r2, r4, r2								\n\t"
		"	cmp		r2, r5									\n\t"		//fault unless (addr - start) < len, unsigned
		"	bcs		j_report_some_fault						\n\t"

#ifdef ROMRAM_DEBUG
		//snapshot for the verification pass at the end. This is done only after both bail-out checks above:
		//j_report_some_fault pops just r4-r7, so pushing earlier would leave these four words on the stack
		//and load them into r4-r7 on the fault path
		"	mov		r2, r4									\n\t"		//r2 = initial base addr
		"	push	{r0, r1, r2, r3}						\n\t"
#endif

		"	movs	r2, #0x18								\n\t"
		"	lsls	r2, r2, #24								\n\t"		//r2 = 0x18000000
		
		"	strb	r2, [r2, #0x08]							\n\t"		//use lane replication to disable ssi
		"	ldr		r6, [r2, #0x00]							\n\t"		//r6 = original config word
		"	subs	r7, r6, #0x02							\n\t"		//minus 0x200
		"	subs	r7, #0xff								\n\t"
		"	subs	r7, #0xff								\n\t"
		"	str		r7, [r2, #0x00]							\n\t"
		"	movs	r7, #1									\n\t"
		"	str		r7, [r2, #0x08]							\n\t"		//enable again
		"	movs	r7, #0x38								\n\t"
		"	str		r7, [r2, #0x60]							\n\t"		//send cmd
		"	nop												\n\t"		//(this delay is needed ... do NOT ask)
		"	str		r4, [r2, #0x60]							\n\t"		//send addr
		
		//wrreg a, getter: if the next bit of the register list (shifted out of r1) is set, fetch register
		//"a" into r5 using "getter", byte-swap it, wait for status bit 1 to be set, send it and bump r4 by 4
		".macro wrreg,a,getter								\n\t"
		"	lsrs	r1, r1, #1								\n\t"
		"	bcc		2f										\n\t"
		"	\\getter \\a									\n\t"
		"	rev     r5, r5									\n\t"
		"1:													\n\t"
		"	ldr		r7, [r2, #0x28]							\n\t"
		"	lsrs	r7, r7, #2								\n\t"
		"	bcc		1b										\n\t"
		"	str		r5, [r2, #0x60]							\n\t"
		"	adds	r4, #4									\n\t"
		"2:													\n\t"
		".endm												\n\t"
		//r0..r3 come from the exception frame
		".macro wrreg_get_L,a								\n\t"
		"	ldr		r5, [r0, #4 * \\a]						\n\t"
		".endm												\n\t"
		//r4..r7 come from our own stack; the debug build has four extra words below them
		".macro wrreg_get_H,a								\n\t"
#ifdef ROMRAM_DEBUG
		"	ldr		r5, [sp, #4*\\a]						\n\t"
#else
		"	ldr		r5, [sp, #4*((\\a) - 4)]				\n\t"
#endif
		".endm												\n\t"
		".macro wrreg_L,a									\n\t"
		"	wrreg	\\a, wrreg_get_L						\n\t"
		".endm												\n\t"
		".macro wrreg_H,a									\n\t"
		"	wrreg	\\a, wrreg_get_H						\n\t"
		".endm												\n\t"
		
		"	wrreg_L	0										\n\t"
		"	wrreg_L	1										\n\t"
		"	wrreg_L	2										\n\t"
		"	wrreg_L	3										\n\t"
		"	wrreg_H	4										\n\t"
		"	wrreg_H	5										\n\t"
		"	wrreg_H	6										\n\t"
		"	wrreg_H	7										\n\t"
		
		"	ldr		r5, [r3]								\n\t"		//get old addr for flushing
		"	str		r4, [r3]								\n\t"		//wbak
		
		//wait for done
		"1:													\n\t"
		"	ldr		r7, [r2, #0x28]							\n\t"
		"	lsrs	r7, r7, #1								\n\t"
		"	bcs		1b										\n\t"
		"	strb	r2, [r2, #0x08]							\n\t"		//use lane replication to disable ssi
		"	str		r6, [r2, #0x00]							\n\t"		//restore the original config word
		"	movs	r7, #1									\n\t"
		"	str		r7, [r2, #0x08]							\n\t"
		
		"	ldr		r2, [r0, #4 * 6]						\n\t"		//advance the pushed pc past the 2-byte instr
		"	adds	r2, #2									\n\t"
		"	str		r2, [r0, #4 * 6]						\n\t"
		
		//flush now (it was found that flushing DURING write was causing the SSI unit to issue spurious word writes
		"1:													\n\t"
		"	stmia	r5!, {r2}								\n\t"		//one plain store per written word, from old addr up to new addr
		"	cmp		r5, r4									\n\t"
		"	bne		1b										\n\t"
		

#ifdef ROMRAM_DEBUG
		"	pop		{r0, r1, r2, r3}						\n\t"		//r0 is state, r1 is instr, r2 is initial base addr, r3 is &base_addr
		
		//verify r0..r3 from the frame against what is now in memory; hang forever on mismatch
		"	movs	r4, #0									\n\t"
		"1:													\n\t"
		"	lsrs	r1, #1									\n\t"
		"	bcc		2f										\n\t"
		"	ldr		r5, [r0, r4]							\n\t"
		"	ldmia	r2!, {r6}								\n\t"
		"	cmp		r5, r6									\n\t"
		"9:													\n\t"
		"	bne		9b										\n\t"
		"2:													\n\t"
		"	adds	r4, #4									\n\t"
		"	cmp		r4, #16									\n\t"
		"	bne		1b										\n\t"
		
		//same for r4..r7 from our stack
		"	movs	r4, #0									\n\t"
		"1:													\n\t"
		"	lsrs	r1, #1									\n\t"
		"	bcc		2f										\n\t"
		"	mov		r5, r4									\n\t"
		"	add		r5, sp									\n\t"
		"	ldr		r5, [r5]								\n\t"
		"	ldmia	r2!, {r6}								\n\t"
		"	cmp		r5, r6									\n\t"
		"9:													\n\t"
		"	bne		9b										\n\t"
		"2:													\n\t"
		"	adds	r4, #4									\n\t"
		"	cmp		r4, #16									\n\t"
		"	bne		1b										\n\t"
		
		//see if addr matches
		"	ldr		r4, [r3]								\n\t"
		"	cmp		r4, r2									\n\t"
		"9:													\n\t"
		"	bne		9b										\n\t"
		
#endif
		"	pop		{r4-r7}									\n\t"		//also applies the writeback if the base is one of r4..r7
		"	bx		lr										\n\t"		//return from the exception
		
		
		//bounds of the window in which faulting stores are emulated; exported so other code can refer to them
		".balign 4											\n\t"
		".globl mRomRamStart								\n\t"
		"mRomRamStart:										\n\t"
		"	.word	0x10200000								\n\t"
		".globl mRomRamLen									\n\t"
		"mRomRamLen:										\n\t"
		"	.word	0x00600000								\n\t"
		
		//get_rd_0: computed jump on instr bits 0..2, 4 bytes of code per register
		//	r0..r3 come from the frame, r5..r7 are read live, r4 comes from [sp] because the
		//	store stubs pushed {r4, lr} and have reused r4 as scratch by the time they call this
		"get_rd_0:											\n\t"		//r1 is instr, r0 is pushed state, on return r1 is value of Rd at position 0 in instr
		"	lsls	r1, r1, #29								\n\t"		//	do remember that r4 is at [sp] not in "r4"
		"	lsrs	r1, r1, #27								\n\t"		//2 instrs per case
		"	add		pc, r1									\n\t"
		"	nop												\n\t"		//padding so that the table starts where pc reads
		
		".macro get_rd_0_rL a								\n\t"
		"	ldr		r1, [r0, #4 * \\a]						\n\t"
		"	bx		lr										\n\t"
		".endm												\n\t"
		".macro get_rd_0_rH a								\n\t"
		"	mov		r1, r\\a								\n\t"
		"	bx		lr										\n\t"
		".endm												\n\t"
		
		"	get_rd_0_rL 0									\n\t"
		"	get_rd_0_rL 1									\n\t"
		"	get_rd_0_rL 2									\n\t"
		"	get_rd_0_rL 3									\n\t"
		
		//r4 is special
		"	ldr		r1, [sp]								\n\t"
		"	bx		lr										\n\t"
		
		"	get_rd_0_rH 5									\n\t"
		"	get_rd_0_rH 6									\n\t"
		"	get_rd_0_rH 7									\n\t"
		
		
		//get_addr_with_imm: address = register in instr bits 3..5, plus (instr bits 6..10 << r3)
		//	computed jump with 8 bytes of code per register; r4..r7 are read live, so this must be
		//	called before the caller reuses r4. clobbers r2
		"get_addr_with_imm:									\n\t"		//r1 is instr, r0 is pushed state, r3 is shift imm by val, on return r3 is addr
		"	lsls	r2, r1, #21								\n\t"
		"	lsrs	r2, r2, #27								\n\t"
		"	lsls	r2, r3									\n\t"		//r2 is now the imm, properly shifted
		"	movs	r3, #0x38								\n\t"
		"	ands	r3, r1									\n\t"		//r3 = base register number * 8
		"	add		pc, r3									\n\t"		//4 instrs per case
		"	nop												\n\t"		//padding so that the table starts where pc reads
		
		".macro get_addr_with_imm_rL,a						\n\t"
		"	ldr		r3, [r0, #4 * \\a]						\n\t"
		"	adds	r3, r3, r2								\n\t"
		"	bx		lr										\n\t"
		"	nop												\n\t"
		".endm												\n\t"
		".macro get_addr_with_imm_rH,a						\n\t"
		"	adds	r3, r\\a, r2							\n\t"
		"	bx		lr										\n\t"
		"	nop												\n\t"
		"	nop												\n\t"
		".endm												\n\t"
		
		"	get_addr_with_imm_rL 0							\n\t"
		"	get_addr_with_imm_rL 1							\n\t"
		"	get_addr_with_imm_rL 2							\n\t"
		"	get_addr_with_imm_rL 3							\n\t"
		"	get_addr_with_imm_rH 4							\n\t"
		"	get_addr_with_imm_rH 5							\n\t"
		"	get_addr_with_imm_rH 6							\n\t"
		"	get_addr_with_imm_rH 7							\n\t"
		
		
		//get_addr_2reg: address = sum of the two registers named in instr bits 3..5 and 6..8
		//	computed jump over 64 cases of 8 bytes each, indexed by instr bits 3..8 (bits 6..8 pick the
		//	group of eight, bits 3..5 the entry inside it); r4..r7 are read live, r0..r3 come from the frame
		"get_addr_2reg:										\n\t"		//r1 is instr, r0 is pushed state, on return r3 is addr, r2 is clobbered. [rX, rX] case could be one cycle faster, but then again, who does that?
		"	lsls	r3, r1, #23								\n\t"
		"	lsrs	r3, r3, #26								\n\t"
		"	lsls	r3, r3, #3								\n\t"		//4 instrs per case
		"	add		pc, r3									\n\t"
		"	nop												\n\t"		//padding so that the table starts where pc reads
		
		//each case macro below is padded with nops to exactly four 16-bit instrs
		".macro	get_addr_2reg_rL_rL,a,b						\n\t"
		"	ldr		r3, [r0, #4 * \\a]						\n\t"
		"	ldr		r2, [r0, #4 * \\b]						\n\t"
		"	adds	r3, r3, r2								\n\t"
		"	bx		lr										\n\t"
		".endm												\n\t"
		".macro	get_addr_2reg_rL_rH,a,b						\n\t"
		"	ldr		r3, [r0, #4 * \\a]						\n\t"
		"	adds	r3, r3, r\\b							\n\t"
		"	bx		lr										\n\t"
		"	nop												\n\t"
		".endm												\n\t"
		".macro	get_addr_2reg_rH_rH,a,b						\n\t"
		"	adds	r3, r\\a, r\\b							\n\t"
		"	bx		lr										\n\t"
		"	nop												\n\t"
		"	nop												\n\t"
		".endm												\n\t"
		//one group of eight cases where the group register is one of r0..r3
		".macro get_addr_2reg_rL,a							\n\t"
		"	get_addr_2reg_rL_rL \\a 0						\n\t"
		"	get_addr_2reg_rL_rL \\a 1						\n\t"
		"	get_addr_2reg_rL_rL \\a 2						\n\t"
		"	get_addr_2reg_rL_rL \\a 3						\n\t"
		"	get_addr_2reg_rL_rH \\a 4						\n\t"
		"	get_addr_2reg_rL_rH \\a 5						\n\t"
		"	get_addr_2reg_rL_rH \\a 6						\n\t"
		"	get_addr_2reg_rL_rH \\a 7						\n\t"
		".endm												\n\t"
		//one group of eight cases where the group register is one of r4..r7
		".macro get_addr_2reg_rH,a							\n\t"
		"	get_addr_2reg_rL_rH 0 \\a						\n\t"
		"	get_addr_2reg_rL_rH 1 \\a						\n\t"
		"	get_addr_2reg_rL_rH 2 \\a						\n\t"
		"	get_addr_2reg_rL_rH 3 \\a						\n\t"
		"	get_addr_2reg_rH_rH 4 \\a						\n\t"
		"	get_addr_2reg_rH_rH 5 \\a						\n\t"
		"	get_addr_2reg_rH_rH 6 \\a						\n\t"
		"	get_addr_2reg_rH_rH 7 \\a						\n\t"
		".endm												\n\t"
		"	get_addr_2reg_rL 0								\n\t"
		"	get_addr_2reg_rL 1								\n\t"
		"	get_addr_2reg_rL 2								\n\t"
		"	get_addr_2reg_rL 3								\n\t"
		"	get_addr_2reg_rH 4								\n\t"
		"	get_addr_2reg_rH 5								\n\t"
		"	get_addr_2reg_rH 6								\n\t"
		"	get_addr_2reg_rH 7								\n\t"
		
		
		//user-supplied asm that goes after all of the handler code
		CODE_AT_END_EXTRA
		
		".ltorg												\n\t"
		:
		: EXTRA_ASM_INPUTS
		: "cc", "memory", "r0", "r1", "r2", "r3", "r12" //yes gcc needs this list...
	);
}



#endif
