#ifdef DESKTOP
#include <pthread.h>
#include <stdlib.h>
#include <string.h>
#include <alloca.h>
#include <stdio.h>
#endif
#include "lz.h"


//Store x at buf as a variable-length number: 7 payload bits per byte, least
//significant group first, bit 7 set on every byte except the last one.
//Returns the number of bytes stored (always at least one).
static uint32_t lzWrite(uint32_t x,uint8_t* buf){

	uint32_t written = 0;

	for (;;) {

		uint8_t group = (uint8_t)(x & 0x7F);

		x >>= 7;

		if (!x) {						//last group: no continuation bit
			buf[written++] = group;
			return written;
		}

		buf[written++] = (uint8_t)(group | 0x80);	//more groups follow
	}
}

//Read a variable-length number from buf (7 payload bits per byte, least
//significant group first, bit 7 set means another byte follows).
//The decoded value is stored to *xP; returns the number of bytes consumed.
//Payload bits that do not fit into 32 bits are discarded. Reading continues
//until a byte without the continuation bit is seen; buf has no length here,
//so the caller must make sure such a byte exists.
static uint32_t lzRead(uint32_t* xP, const uint8_t* buf){

	uint32_t d, nb = 0, x = 0, shiftBy = 0;

	do{

		d = buf[nb++];

		//shifting a 32-bit value by 32 or more is undefined, so groups past
		//the fifth byte are consumed but contribute nothing
		if (shiftBy < 32) {
			x |= (d & 0x7F) << shiftBy;
			shiftBy += 7;
		}

	} while(d & 0x80);

	*xP = x;
	return nb;
}
#ifdef DESKTOP
	
	//Dimensions of the candidate-offset table in lzCompressSingleThreadFast():
	//the row is picked by fnvHash() % LZ_NUM_SETS, and each row holds
	//LZ_NUM_WAYS remembered input offsets.
	#define LZ_NUM_SETS	256
	#define LZ_NUM_WAYS	4
	
	
	//Hash exactly four bytes, data[0] through data[3]: starting from 0x177,
	//each byte is xor-ed in and the result multiplied by 0x01000193.
	//The caller must make sure all four bytes are readable.
	static uint32_t fnvHash(const uint8_t *data)
	{
		uint32_t acc = 0x177;
		unsigned idx;
		
		for (idx = 0; idx < 4; idx++)
			acc = (acc ^ data[idx]) * 0x01000193;

		return acc;
	}
	
	//fixed memory but fast
	//Compress insize bytes from in to out and return the number of bytes written.
	//
	//Output layout: the first input byte verbatim, then a sequence of
	//  - literal bytes,
	//  - marker, 0                                -> one literal marker byte,
	//  - marker, varint(offset), varint(length-3) -> copy length bytes from offset bytes back.
	//Match candidates come from a fixed LZ_NUM_SETS x LZ_NUM_WAYS table of
	//recently hashed input offsets, so memory use does not depend on insize.
	//
	//blockSz and extraBackreferenceSpaceAvail are accepted for signature parity
	//with lzCompressSingleThread() but are not consulted here.
	//out must be large enough for the result; no bound is checked.
	uint32_t lzCompressSingleThreadFast(uint8_t* out, uint8_t marker, const uint8_t* in, uint32_t insize, uint32_t blockSz, uint32_t extraBackreferenceSpaceAvail)
	{
		uint32_t ofsts[LZ_NUM_SETS][LZ_NUM_WAYS] = {0,}, nextWay = 0;
		uint32_t inpos = 0, outpos = 0, lastHashed = 0;
		
		//empty input: nothing to emit, and in[0] must not be read
		if (!insize)
			return 0;
		
		//first byte always goes out verbatim
		out[outpos++] = in[inpos++];
		
		while (inpos < insize) {
			uint32_t maxMatchLen = insize - inpos, bestLength = 0, bestOffset = 0, way, *set;
			
			//hash new data. fnvHash() reads 4 bytes, so positions with fewer than
			//4 bytes left are never hashed (that would read past the input)
			while (lastHashed < inpos && insize - lastHashed >= 4) {
				ofsts[fnvHash(in + lastHashed) % LZ_NUM_SETS][nextWay++ % LZ_NUM_WAYS] = lastHashed;
				lastHashed++;
			}
			
			//hash-accelerated search. A backreference costs at least 3 bytes and is
			//only used when strictly shorter than the match, so with fewer than
			//4 bytes left nothing found here could be emitted anyway
			if (maxMatchLen >= 4) {
				
				for (set = ofsts[fnvHash(in + inpos) % LZ_NUM_SETS],way = 0; way < LZ_NUM_WAYS; way++) {
					
					uint32_t matchLen, possibleOfst = set[way];
					
					//check for actual match (hash does not promise anything)
					for (matchLen = 0; matchLen < maxMatchLen && in[possibleOfst + matchLen] == in[inpos + matchLen]; matchLen++);
					
					//is it better than before ?
					if (matchLen > bestLength) {
						bestLength = matchLen;
						bestOffset = possibleOfst;
					}
				}
			}
			
			// how big will the encoded backreference be?
			if (bestLength >= 3 /* never makes sense to do that, and we do not even have a way to encode that */) {
				
				uint32_t encodingSize;
				
				//offset should be relative to cur pos
				bestOffset = inpos - bestOffset;
				
				//marker byte + 7-bit groups needed for length and for offset
				encodingSize = 1 + (32 - __builtin_clz(bestLength) + 6) / 7 + (32 - __builtin_clz(bestOffset) + 6) / 7;
			
				if (encodingSize < bestLength) {
					
					out[outpos++] = marker;
					outpos += lzWrite(bestOffset, out + outpos);
					outpos += lzWrite(bestLength - 3, out + outpos);
					inpos += bestLength;
					continue;
				}
			}
			
			if (in[inpos] == marker) {	//raw marker byte? emit escape
				
				out[outpos++] = in[inpos++];
				out[outpos++] = 0;
			}
			else {								//copy byte
				
				out[outpos++] = in[inpos++];
			}
		}
	
		return outpos;
	}
	
	//One remembered input position in lzCompressSingleThread(). Entries whose
	//4-byte hash collapses to the same bucket are chained, newest first.
	struct HashEntry {
		struct HashEntry *next;	//entry that was bucket head before this one was added (NULL if none)
		uint32_t ofst;			//input offset the entry was hashed at
	};
	
	static uint32_t lzHashCollapse(uint32_t raw)
	{
		raw ^= raw >> 10;
		raw ^= raw >> 10;
		
		return raw & 0xfff;
	}
	
	//Compress insize bytes from inb to out and return the number of bytes written.
	//
	//Output layout: the first input byte verbatim, then a sequence of
	//  - literal bytes,
	//  - marker, 0                                -> one literal marker byte,
	//  - marker, varint(offset), varint(length-3) -> copy length bytes from offset bytes back.
	//Every hashable input position is remembered in a per-bucket chain, so the
	//working memory grows with insize (one struct HashEntry per input byte is
	//allocated and released before returning). Candidates further back than
	//blockSz bytes are not considered.
	//
	//extraBackreferenceSpaceAvail is accepted but not consulted.
	//out must be large enough for the result; no bound is checked.
	//Aborts the process if the working memory cannot be allocated.
	uint32_t lzCompressSingleThread(uint8_t* out, uint8_t marker, const uint8_t* inb, uint32_t insize, uint32_t blockSz, uint32_t extraBackreferenceSpaceAvail)
	{
		struct HashEntry *heads[4096] = {0,}, *hashes, *opt;
		uint32_t inpos = 0, outpos = 0, lastHashed = 0, nextFreeHashEntry = 0;
		
		//empty input: nothing to emit, and inb[0] must not be read
		if (!insize)
			return 0;
		
		hashes = calloc(insize, sizeof(struct HashEntry));	//could be big!
		if (!hashes)
			abort();
		
		//first byte always goes out verbatim
		out[outpos++] = inb[inpos++];
	
		while (inpos < insize) {
			
			uint32_t maxMatchLen = insize - inpos, bestLength = 0, bestOffset = 0;
			
			//hash new data. fnvHash() reads 4 bytes, so positions with fewer than
			//4 bytes left are never hashed (that would read past the input)
			while (lastHashed < inpos && insize - lastHashed >= 4) {
				
				uint32_t hash = lzHashCollapse(fnvHash(inb + lastHashed));
				
				//push onto the front of the bucket chain
				opt = hashes + nextFreeHashEntry++;
				opt->next = heads[hash];
				opt->ofst = lastHashed;
				heads[hash] = opt;
				
				lastHashed++;
			}
			
			//hash-accelerated search. A backreference costs at least 3 bytes and is
			//only used when strictly shorter than the match, so with fewer than
			//4 bytes left nothing found here could be emitted anyway
			if (maxMatchLen >= 4) {
				
				for (opt = heads[lzHashCollapse(fnvHash(inb + inpos))]; opt; opt = opt->next) {
					
					uint32_t possibleOfst = opt->ofst;
					uint32_t matchLen;
					
					//if we are too far back, do not bother (speed opt)
					//chains are newest-first, so everything after this is even further back
					if (inpos - possibleOfst > blockSz)
						break;
					
					//check for actual match (hash does not promise anything)
					for (matchLen = 0; matchLen < maxMatchLen && inb[possibleOfst + matchLen] == inb[inpos + matchLen]; matchLen++);
					
					//is it better than before ?
					if (matchLen > bestLength) {
						bestLength = matchLen;
						bestOffset = possibleOfst;
					}
				}
			}
			
			// how big will the encoded backreference be?
			if (bestLength >= 3 /* never makes sense to do that, and we do not even have a way to encode that */) {
				
				uint32_t encodingSize;
				
				//offset should be relative to cur pos
				bestOffset = inpos - bestOffset;
				
				//marker byte + 7-bit groups needed for length and for offset
				encodingSize = 1 + (32 - __builtin_clz(bestLength) + 6) / 7 + (32 - __builtin_clz(bestOffset) + 6) / 7;
			
				if (encodingSize < bestLength) {
					
					out[outpos++] = marker;
					outpos += lzWrite(bestOffset, out + outpos);
					outpos += lzWrite(bestLength - 3, out + outpos);
					inpos += bestLength;
					continue;
				}
			}
			
			if (inb[inpos] == marker) {	//raw marker byte? emit escape
				
				out[outpos++] = inb[inpos++];
				out[outpos++] = 0;
			}
			else {								//copy byte
				
				out[outpos++] = inb[inpos++];
			}
		}
	
		free(hashes);
		
		return outpos;
	}
	
	//Per-thread job description used by lzCompress() when it splits the input
	//across several worker threads.
	struct CompressParam {
		uint8_t* out;							//worker's private output buffer (allocated and freed by lzCompress)
		const uint8_t *in;						//start of this worker's slice of the input
		uint32_t insize;						//number of input bytes in the slice
		uint32_t outsize;						//filled in by the worker: bytes written to out
		uint32_t blockSz;						//passed through to lzCompressSingleThread
		uint32_t extraBackreferenceSpaceAvail;	//set to the slice's start offset within the whole input
		pthread_t me;							//handle of the worker thread
		uint8_t marker;							//marker byte shared by all slices
	};
	
	//Thread start routine: param points at a struct CompressParam. Runs
	//lzCompressSingleThread() on the described slice and records the produced
	//size in its outsize field. Always returns NULL.
	static void* lzCompressMultithread(void* param)
	{
		struct CompressParam *job = param;
		uint32_t produced;
		
		produced = lzCompressSingleThread(job->out, job->marker, job->in, job->insize, job->blockSz, job->extraBackreferenceSpaceAvail);
		job->outsize = produced;
		
		return NULL;
	}

	//Compress insize bytes from inb into out and return the total number of
	//bytes written, including the leading marker byte. Returns 0 for empty input.
	//
	//out[0] holds the marker: the byte value that occurs least often in the
	//input. The compressed data follows. When nCpus is in 2..64 and there is at
	//least one input byte per worker, the input is cut into nCpus slices that
	//are compressed on separate threads and concatenated; otherwise the whole
	//input is compressed on the calling thread.
	//
	//out must be large enough for the result; no bound is checked.
	//Aborts the process on allocation or thread failures.
	uint32_t lzCompress(const uint8_t* inb, uint8_t* out, uint32_t insize, uint32_t blockSz, uint32_t nCpus)
	{
		uint32_t i, histogram[256] = {0, }, outpos = 0;
		uint8_t marker;
	
		// Do we have anything to compress?
		if (!insize)
			return 0;
	
		// Create histogram
		for (i = 0; i < insize; i++)
			histogram[inb[i]]++;
	
		// Find the least common byte, and use it as the code marker
		marker = 0;
		for (i = 1; i < 256; i++){
			if (histogram[i] < histogram[marker])
				marker = i;
		}
	
		out[outpos++] = marker;
		
		//every worker needs at least one byte, else its slice would be empty
		if (nCpus > 1 && nCpus <= 64 && insize >= nCpus) {
			
			struct CompressParam *cps = alloca(sizeof(struct CompressParam) * nCpus);
			uint32_t sliceSz = insize / nCpus;
		
			//start them
			for (i = 0; i < nCpus; i++) {
				
				uint32_t myWorkStart = sliceSz * i;
				uint32_t myWorkSize = (i == nCpus - 1) ? insize - myWorkStart : sliceSz;	//last one takes the remainder
				
				cps[i].out = malloc((size_t)myWorkSize * 4);	//definitely enough mem
				if (!cps[i].out)
					abort();
				
				cps[i].in = inb + myWorkStart;
				cps[i].insize = myWorkSize;
				cps[i].blockSz = blockSz;
				cps[i].extraBackreferenceSpaceAvail = myWorkStart;
				cps[i].marker = marker;
				
				if (pthread_create(&cps[i].me, NULL, lzCompressMultithread, &cps[i]))
					abort();
			}
			
			//wait for them
			for (i = 0; i < nCpus; i++) {
				if (pthread_join(cps[i].me, NULL))
					abort();
			}
		
			//collect results and free memory
			for (i = 0; i < nCpus; i++) {
				
				const uint8_t *src = cps[i].out;
				uint32_t len = cps[i].outsize;
				
				//each worker stores its first input byte verbatim. That is only
				//understood at the very start of the stream; anywhere else a
				//marker byte has to be written as the (marker, 0) escape
				if (i && len && src[0] == marker) {
					out[outpos++] = marker;
					out[outpos++] = 0;
					src++;
					len--;
				}
				
				memcpy(out + outpos, src, len);
				outpos += len;
				free(cps[i].out);
			}
			
			return outpos;
		}
	
		//account for the marker byte already written, same as the threaded path
		return outpos + lzCompressSingleThread(out + outpos, marker, inb, insize, blockSz, 0);
	}

#endif


//Decompress insize bytes from inb into out and return the number of bytes
//produced.
//
//Stream layout: inb[0] is the marker byte and inb[1] the first output byte.
//After that each token is one of
//  - any byte other than the marker           -> copied as is,
//  - marker, 0                                -> one literal marker byte,
//  - marker, varint(offset), varint(length-3) -> copy length bytes starting
//    offset bytes behind the current output position (ranges may overlap).
//Streams shorter than two bytes carry no data and yield 0.
//
//No validation is done on offsets, lengths or the output size: the stream is
//taken to be well formed and out to be large enough.
uint32_t lzUncompress(const uint8_t* inb, uint8_t* out, uint32_t insize)
{
	uint32_t i, inpos = 0, outpos = 0, length, offset;
	uint8_t marker, symbol;

	// Do we have anything to decompress? (marker plus at least one byte)
	if (insize < 2)
		return 0;

	// Get marker symbol from input stream
	marker = inb[inpos++];
	out[outpos++] = inb[inpos++];

	//test before every token so nothing past insize is decoded
	while (inpos < insize) {

		symbol = inb[inpos++];
		if (symbol != marker) {		//raw byte? copy it
			
			out[outpos++] = symbol;
		}
		else if (!inb[inpos]){		//emit a single marker byte
			
			out[outpos++] = marker;
			inpos++;
		}
		else {						//backreference
			
			//read parameters
			inpos += lzRead(&offset, inb + inpos);
			inpos += lzRead(&length, inb + inpos);
			
			length += 3;
			
			//copy data byte by byte: source and destination may overlap
			for (i = 0; i < length; i++, outpos++)
				out[outpos] = out[outpos - offset];
		}
	}

	return outpos;
}

