#ifndef __INC_FASTSPI_BITBANG_H
#define __INC_FASTSPI_BITBANG_H

#include "FastLED.h"

#include "fastled_delay.h"

FASTLED_NAMESPACE_BEGIN

//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
//
// Software SPI (aka bit-banging) support - with aggressive optimizations for when the clock and data pin are on the same port
//
// TODO: Replace the select pin definition with a set of pins, to allow using mux hardware for routing in the future
//
//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////

// Software (bit-banged) SPI output on a data/clock pin pair.
//
// Template parameters:
//   DATA_PIN  - pin number handed to FastPin<> for the data line
//   CLOCK_PIN - pin number handed to FastPin<> for the clock line
//   SPI_SPEED - speed value consumed by the CLOCK_HI_DELAY / CLOCK_LO_DELAY macros defined further down
//
// Every byte is shifted out most-significant bit first.  For each bit the data line is set, then the
// clock line is driven high and back low, with CLOCK_HI_DELAY / CLOCK_LO_DELAY after each clock edge.
//
// An optional Selectable may be attached; select()/release() forward to it and are no-ops when it is NULL.
template <uint8_t DATA_PIN, uint8_t CLOCK_PIN, uint32_t SPI_SPEED>
class AVRSoftwareSPIOutput {
	// The data types for pointers to the pin port - typedef'd here from the Pin definition because on avr these
	// are pointers to 8 bit values, while on arm they are 32 bit
	typedef typename FastPin<DATA_PIN>::port_ptr_t data_ptr_t;
	typedef typename FastPin<CLOCK_PIN>::port_ptr_t clock_ptr_t;

	// The data type for what's at a pin's port - typedef'd here from the Pin definition because on avr the ports
	// are 8 bits wide while on arm they are 32.
	typedef typename FastPin<DATA_PIN>::port_t data_t;
	typedef typename FastPin<CLOCK_PIN>::port_t clock_t;

	// Optional select object; may be NULL, in which case select()/release() do nothing.
	Selectable 	*m_pSelect;

public:
	// Construct with no select object attached.
	AVRSoftwareSPIOutput() : m_pSelect(NULL) { }

	// Construct with the select object that select()/release() will forward to (may be NULL).
	AVRSoftwareSPIOutput(Selectable *pSelect) : m_pSelect(pSelect) { }

	// Replace the select object used by select()/release() (may be NULL).
	void setSelect(Selectable *pSelect) { m_pSelect = pSelect; }

	// Configure the data and clock pins as outputs, then release the select object (if any).
	// TODO: confirm whether "released" corresponds to the select line being hi or lo.
	void init() {
		FastPin<DATA_PIN>::setOutput();
		FastPin<CLOCK_PIN>::setOutput();
		release();
	}

	// stop the SPI output.  Pretty much a NOP with software, as there's no registers to kick
	static void stop() { }

	// wait until the SPI subsystem is ready for more data to write.  A NOP when bitbanging
	static void wait() __attribute__((always_inline)) { }

	// Same as wait(); also a NOP when bitbanging.
	static void waitFully() __attribute__((always_inline)) { wait(); }

	// Write one byte.  There is nothing to wait for when bitbanging, so this is just writeByte().
	static void writeByteNoWait(uint8_t b) __attribute__((always_inline)) { writeByte(b); }

	// Write one byte, then call wait() (a NOP here).
	static void writeBytePostWait(uint8_t b) __attribute__((always_inline)) { writeByte(b); wait(); }

	// Write a 16 bit word as two bytes, high byte first.
	static void writeWord(uint16_t w) __attribute__((always_inline)) { writeByte(w>>8); writeByte(w&0xFF); }

	// naive writeByte implementation, simply calls writeBit on the 8 bits in the byte, bit 7 first.
	static void writeByte(uint8_t b) {
		writeBit<7>(b);
		writeBit<6>(b);
		writeBit<5>(b);
		writeBit<4>(b);
		writeBit<3>(b);
		writeBit<2>(b);
		writeBit<1>(b);
		writeBit<0>(b);
	}

private:
	// writeByte implementation with data/clock registers passed in.  Bit 7 goes out first.
	static void writeByte(uint8_t b, clock_ptr_t clockpin, data_ptr_t datapin)  {
		writeBit<7>(b, clockpin, datapin);
		writeBit<6>(b, clockpin, datapin);
		writeBit<5>(b, clockpin, datapin);
		writeBit<4>(b, clockpin, datapin);
		writeBit<3>(b, clockpin, datapin);
		writeBit<2>(b, clockpin, datapin);
		writeBit<1>(b, clockpin, datapin);
		writeBit<0>(b, clockpin, datapin);
	}

	// writeByte implementation for when clock and data are on the same GPIO register.  The single port
	// pointer is passed in together with four prebaked port values, one for each combination of data hi/lo
	// and clock hi/lo.  Can get close to getting a bit out the door in 2 clock cycles!
	//
	// The parameter names and types mirror the same-port writeBit overload these values are forwarded to.
	static void writeByte(uint8_t b, data_ptr_t datapin,
						  data_t datahi_clockhi, data_t datalo_clockhi,
						  data_t datahi_clocklo, data_t datalo_clocklo) {
		writeBit<7>(b, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
		writeBit<6>(b, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
		writeBit<5>(b, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
		writeBit<4>(b, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
		writeBit<3>(b, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
		writeBit<2>(b, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
		writeBit<1>(b, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
		writeBit<0>(b, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
	}

	// writeByte implementation with not just registers passed in, but pre-baked values for said registers for
	// data hi/lo and clock hi/lo values.  Note: weird things will happen if this method is called in cases where
	// the data and clock pins are on the same port!  Don't do that!
	static void writeByte(uint8_t b, clock_ptr_t clockpin, data_ptr_t datapin,
						  data_t hival, data_t loval,
						  clock_t hiclock, clock_t loclock) {
		writeBit<7>(b, clockpin, datapin, hival, loval, hiclock, loclock);
		writeBit<6>(b, clockpin, datapin, hival, loval, hiclock, loclock);
		writeBit<5>(b, clockpin, datapin, hival, loval, hiclock, loclock);
		writeBit<4>(b, clockpin, datapin, hival, loval, hiclock, loclock);
		writeBit<3>(b, clockpin, datapin, hival, loval, hiclock, loclock);
		writeBit<2>(b, clockpin, datapin, hival, loval, hiclock, loclock);
		writeBit<1>(b, clockpin, datapin, hival, loval, hiclock, loclock);
		writeBit<0>(b, clockpin, datapin, hival, loval, hiclock, loclock);
	}

public:

	// Clock phase delays.  We want to make sure that the clock pulse is held high for a minimum of 35ns.
	//
	// On FASTLED_TEENSY4 the delays are expressed in nanoseconds derived from SPI_SPEED/1000000; everywhere
	// else they are cycle counts handed to delaycycles<>, with MIN_DELAY derived from NS(35).
	//
	// NOTE: each CLOCK_*_DELAY macro already ends in ';'.  A use site written as `CLOCK_HI_DELAY;` therefore
	// expands to the delay statement followed by an empty statement - do not use these macros as the
	// unbraced body of an if/else.  The macros are not #undef'd in this header.
#if defined(FASTLED_TEENSY4)
	#define DELAY_NS (1000 / (SPI_SPEED/1000000))
	#define CLOCK_HI_DELAY do { delayNanoseconds((DELAY_NS/4)); } while(0);
	#define CLOCK_LO_DELAY do { delayNanoseconds((DELAY_NS/4)); } while(0);
#else
	#define MIN_DELAY ((NS(35)>3) ? (NS(35) - 3) : 1)

	#define CLOCK_HI_DELAY do { delaycycles<MIN_DELAY>(); delaycycles<((SPI_SPEED > 10) ? (((SPI_SPEED-6) / 2) - MIN_DELAY) : (SPI_SPEED))>(); } while(0);
	#define CLOCK_LO_DELAY do { delaycycles<((SPI_SPEED > 10) ? ((SPI_SPEED-6) / 2) : (SPI_SPEED))>(); } while(0);
#endif

	// write the BIT'th bit of b out via spi, setting the data pin then strobing the clock.  This version goes
	// through the plain FastPin hi()/lo()/toggle() calls and does not mask interrupts.
	template <uint8_t BIT> __attribute__((always_inline, hot)) inline static void writeBit(uint8_t b) {
		if(b & (1 << BIT)) {
			FastPin<DATA_PIN>::hi();
#ifdef ESP32
			// try to ensure we never have adjacent write opcodes to the same register
			FastPin<CLOCK_PIN>::lo();
			FastPin<CLOCK_PIN>::hi(); CLOCK_HI_DELAY;
			FastPin<CLOCK_PIN>::toggle(); CLOCK_LO_DELAY;
#else
			FastPin<CLOCK_PIN>::hi(); CLOCK_HI_DELAY;
			FastPin<CLOCK_PIN>::lo(); CLOCK_LO_DELAY;
#endif
		} else {
			FastPin<DATA_PIN>::lo();
			FastPin<CLOCK_PIN>::hi(); CLOCK_HI_DELAY;
#ifdef ESP32
			// try to ensure we never have adjacent write opcodes to the same register
			// NOTE(review): this is the only place where CLOCK_HI_DELAY follows the falling clock edge; every
			// other path uses CLOCK_LO_DELAY there.  Left as-is because the timing cannot be verified from
			// this file - confirm on hardware whether this is intentional.
			FastPin<CLOCK_PIN>::toggle(); CLOCK_HI_DELAY;
#else
			FastPin<CLOCK_PIN>::lo(); CLOCK_LO_DELAY;
#endif
		}
	}

private:
	// write the BIT'th bit out via spi, setting the data pin then strobing the clock, using the passed in pin registers to accelerate access if needed
	template <uint8_t BIT> __attribute__((always_inline)) inline static void writeBit(uint8_t b, clock_ptr_t clockpin, data_ptr_t datapin) {
		if(b & (1 << BIT)) {
			FastPin<DATA_PIN>::hi(datapin);
			FastPin<CLOCK_PIN>::hi(clockpin); CLOCK_HI_DELAY;
			FastPin<CLOCK_PIN>::lo(clockpin); CLOCK_LO_DELAY;
		} else {
			FastPin<DATA_PIN>::lo(datapin);
			FastPin<CLOCK_PIN>::hi(clockpin); CLOCK_HI_DELAY;
			FastPin<CLOCK_PIN>::lo(clockpin); CLOCK_LO_DELAY;
		}

	}

	// the version of write to use when clock and data are on separate ports, with precomputed values for
	// setting the clock and data pins.  One store to the data port is followed by two stores to the clock port.
	template <uint8_t BIT> __attribute__((always_inline)) inline static void writeBit(uint8_t b, clock_ptr_t clockpin, data_ptr_t datapin,
													data_t hival, data_t loval, clock_t hiclock, clock_t loclock) {
		if(b & (1 << BIT)) {
			FastPin<DATA_PIN>::fastset(datapin, hival);
			FastPin<CLOCK_PIN>::fastset(clockpin, hiclock); CLOCK_HI_DELAY;
			FastPin<CLOCK_PIN>::fastset(clockpin, loclock); CLOCK_LO_DELAY;
		} else {
			FastPin<DATA_PIN>::fastset(datapin, loval);
			FastPin<CLOCK_PIN>::fastset(clockpin, hiclock); CLOCK_HI_DELAY;
			FastPin<CLOCK_PIN>::fastset(clockpin, loclock); CLOCK_LO_DELAY;
		}
	}

	// the version of write to use when clock and data are on the same port with precomputed values for the various
	// combinations.  Each bit is three stores to the shared port: data with clock lo, data with clock hi, then
	// data with clock lo again.
	template <uint8_t BIT> __attribute__((always_inline)) inline static void writeBit(uint8_t b, data_ptr_t clockdatapin,
													data_t datahiclockhi, data_t dataloclockhi,
													data_t datahiclocklo, data_t dataloclocklo) {
		if(b & (1 << BIT)) {
			FastPin<DATA_PIN>::fastset(clockdatapin, datahiclocklo);
			FastPin<DATA_PIN>::fastset(clockdatapin, datahiclockhi); CLOCK_HI_DELAY;
			FastPin<DATA_PIN>::fastset(clockdatapin, datahiclocklo); CLOCK_LO_DELAY;
		} else {
			FastPin<DATA_PIN>::fastset(clockdatapin, dataloclocklo);
			FastPin<DATA_PIN>::fastset(clockdatapin, dataloclockhi); CLOCK_HI_DELAY;
			FastPin<DATA_PIN>::fastset(clockdatapin, dataloclocklo); CLOCK_LO_DELAY;
		}
	}
public:

	// select the SPI output; forwards to the attached Selectable, does nothing when none is attached.
	// (TODO: research whether this really means hi or lo.  Alt TODO: move select responsibility out of the SPI
	// classes entirely, make it up to the caller to remember to lock/select the line?)
	void select() { if(m_pSelect != NULL) { m_pSelect->select(); } }

	// release the SPI line; forwards to the attached Selectable, does nothing when none is attached.
	void release() { if(m_pSelect != NULL) { m_pSelect->release(); } }

	// Write out len bytes of the given value out over SPI, bracketed by select()/release().  Useful for
	// quickly flushing, say, a line of 0's down the line.
	void writeBytesValue(uint8_t value, int len) {
		select();
		writeBytesValueRaw(value, len);
		release();
	}

	// Write value out len times without touching the select line.  A len of zero or less writes nothing.
	static void writeBytesValueRaw(uint8_t value, int len) {
#ifdef FAST_SPI_INTERRUPTS_WRITE_PINS
		// TODO: Weird things may happen if software bitbanging SPI output and other pins on the output registers are being twiddled.  Need
		// to allow specifying whether or not exclusive i/o access is allowed during this process, and if i/o access is not allowed fall
		// back to the degenerative code below
		while(len-- > 0) {
			writeByte(value);
		}
#else
		data_ptr_t datapin = FastPin<DATA_PIN>::port();

		if(FastPin<DATA_PIN>::port() != FastPin<CLOCK_PIN>::port()) {
			// If data and clock are on different ports, then writing a bit will consist of writing the value for
			// the bit (hi or low) to the data pin port, and then two writes to the clock port to strobe the clock line
			clock_ptr_t clockpin = FastPin<CLOCK_PIN>::port();
			data_t datahi = FastPin<DATA_PIN>::hival();
			data_t datalo = FastPin<DATA_PIN>::loval();
			clock_t clockhi = FastPin<CLOCK_PIN>::hival();
			clock_t clocklo = FastPin<CLOCK_PIN>::loval();
			while(len-- > 0) {
				writeByte(value, clockpin, datapin, datahi, datalo, clockhi, clocklo);
			}

		} else {
			// If data and clock are on the same port then we can combine setting the data and clock pins
			data_t datahi_clockhi = FastPin<DATA_PIN>::hival() | FastPin<CLOCK_PIN>::mask();
			data_t datalo_clockhi = FastPin<DATA_PIN>::loval() | FastPin<CLOCK_PIN>::mask();
			data_t datahi_clocklo = FastPin<DATA_PIN>::hival() & ~FastPin<CLOCK_PIN>::mask();
			data_t datalo_clocklo = FastPin<DATA_PIN>::loval() & ~FastPin<CLOCK_PIN>::mask();

			while(len-- > 0) {
				writeByte(value, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
			}
		}
#endif
	}

	// write a block of len uint8_ts out, bracketed by select()/release().  Need to type this better so that
	// explicit casts into the call aren't required.
	//
	// The class parameter D supplies a per-byte modifier: every byte is passed through D::adjust() before it
	// is written, and D::postBlock(len) is called once after the last byte.  A len of zero or less writes no
	// bytes (select, postBlock and release still happen).
	template <class D> void writeBytes(uint8_t *data, int len) {
		select();
#ifdef FAST_SPI_INTERRUPTS_WRITE_PINS
		uint8_t *end = data + len;
		while(data < end) {
			writeByte(D::adjust(*data++));
		}
#else
		data_ptr_t datapin = FastPin<DATA_PIN>::port();

		if(FastPin<DATA_PIN>::port() != FastPin<CLOCK_PIN>::port()) {
			// If data and clock are on different ports, then writing a bit will consist of writing the value for
			// the bit (hi or low) to the data pin port, and then two writes to the clock port to strobe the clock line
			clock_ptr_t clockpin = FastPin<CLOCK_PIN>::port();
			data_t datahi = FastPin<DATA_PIN>::hival();
			data_t datalo = FastPin<DATA_PIN>::loval();
			clock_t clockhi = FastPin<CLOCK_PIN>::hival();
			clock_t clocklo = FastPin<CLOCK_PIN>::loval();
			uint8_t *end = data + len;

			while(data < end) {
				writeByte(D::adjust(*data++), clockpin, datapin, datahi, datalo, clockhi, clocklo);
			}

		} else {
			// If data and clock are on the same port then we can combine setting the data and clock pins
			data_t datahi_clockhi = FastPin<DATA_PIN>::hival() | FastPin<CLOCK_PIN>::mask();
			data_t datalo_clockhi = FastPin<DATA_PIN>::loval() | FastPin<CLOCK_PIN>::mask();
			data_t datahi_clocklo = FastPin<DATA_PIN>::hival() & ~FastPin<CLOCK_PIN>::mask();
			data_t datalo_clocklo = FastPin<DATA_PIN>::loval() & ~FastPin<CLOCK_PIN>::mask();

			uint8_t *end = data + len;

			while(data < end) {
				writeByte(D::adjust(*data++), datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
			}
		}
#endif
		D::postBlock(len);
		release();
	}

	// default version of writing a block of data out to the SPI port, with no data modifications being made
	void writeBytes(uint8_t *data, int len) { writeBytes<DATA_NOP>(data, len); }


	// Write every pixel held by the given PixelController out as three bytes each, bracketed by
	// select()/release().
	//
	// Template parameters:
	//   FLAGS     - when FLAG_START_BIT is set, a single 1 bit is clocked out ahead of each pixel's three bytes
	//   D         - per-byte modifier: each byte goes through D::adjust(), and D::postBlock() is called once
	//               at the end with the pixel count read from pixels.mLen before the loop started
	//   RGB_ORDER - channel ordering, forwarded to PixelController<>
	//
	// For each pixel the bytes come from loadAndScale0/1/2(), after which the controller is advanced with
	// advanceData() and stepDithering().
	template <uint8_t FLAGS, class D, EOrder RGB_ORDER>  __attribute__((noinline)) void writePixels(PixelController<RGB_ORDER> pixels) {
		select();
		int len = pixels.mLen;

#ifdef FAST_SPI_INTERRUPTS_WRITE_PINS
		// If interrupts or other things may be generating output while we're working on things, then we need
		// to use this block
		while(pixels.has(1)) {
			if(FLAGS & FLAG_START_BIT) {
				writeBit<0>(1);
			}
			writeByte(D::adjust(pixels.loadAndScale0()));
			writeByte(D::adjust(pixels.loadAndScale1()));
			writeByte(D::adjust(pixels.loadAndScale2()));
			pixels.advanceData();
			pixels.stepDithering();
		}
#else
		// If we can guarantee that no one else will be writing data while we are running (namely, changing the values of the PORT/PDOR pins)
		// then we can use a bunch of optimizations in here
		data_ptr_t datapin = FastPin<DATA_PIN>::port();

		if(FastPin<DATA_PIN>::port() != FastPin<CLOCK_PIN>::port()) {
			clock_ptr_t clockpin = FastPin<CLOCK_PIN>::port();
			// If data and clock are on different ports, then writing a bit will consist of writing the value for
			// the bit (hi or low) to the data pin port, and then two writes to the clock port to strobe the clock line
			data_t datahi = FastPin<DATA_PIN>::hival();
			data_t datalo = FastPin<DATA_PIN>::loval();
			clock_t clockhi = FastPin<CLOCK_PIN>::hival();
			clock_t clocklo = FastPin<CLOCK_PIN>::loval();

			while(pixels.has(1)) {
				if(FLAGS & FLAG_START_BIT) {
					writeBit<0>(1, clockpin, datapin, datahi, datalo, clockhi, clocklo);
				}
				writeByte(D::adjust(pixels.loadAndScale0()), clockpin, datapin, datahi, datalo, clockhi, clocklo);
				writeByte(D::adjust(pixels.loadAndScale1()), clockpin, datapin, datahi, datalo, clockhi, clocklo);
				writeByte(D::adjust(pixels.loadAndScale2()), clockpin, datapin, datahi, datalo, clockhi, clocklo);
				pixels.advanceData();
				pixels.stepDithering();
			}

		} else {
			// If data and clock are on the same port then we can combine setting the data and clock pins
			data_t datahi_clockhi = FastPin<DATA_PIN>::hival() | FastPin<CLOCK_PIN>::mask();
			data_t datalo_clockhi = FastPin<DATA_PIN>::loval() | FastPin<CLOCK_PIN>::mask();
			data_t datahi_clocklo = FastPin<DATA_PIN>::hival() & ~FastPin<CLOCK_PIN>::mask();
			data_t datalo_clocklo = FastPin<DATA_PIN>::loval() & ~FastPin<CLOCK_PIN>::mask();

			while(pixels.has(1)) {
				if(FLAGS & FLAG_START_BIT) {
					writeBit<0>(1, datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
				}
				writeByte(D::adjust(pixels.loadAndScale0()), datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
				writeByte(D::adjust(pixels.loadAndScale1()), datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
				writeByte(D::adjust(pixels.loadAndScale2()), datapin, datahi_clockhi, datalo_clockhi, datahi_clocklo, datalo_clocklo);
				pixels.advanceData();
				pixels.stepDithering();
			}
		}
#endif
		D::postBlock(len);
		release();
	}
};

FASTLED_NAMESPACE_END

#endif
