/**
 *  Process Block
 *
 *  Copyright (C) 2006-2014 Teru Kamogashira
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or
 *  (at your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 *  GNU General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License
 *  along with this program; if not, write to the Free Software
 *  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

#include "ProcessBlock.hpp"

/**
 * Worker thread entry point.
 *
 * vdParam is cast to a ProcessBlockThreadInfo*. The thread sleeps until the
 * "start" event is signalled, then services the bits found in
 * *info->threadFlags until they are all clear:
 *
 *  - PB3_THREAD_FLAG_EXIT     : clear the bit and leave the thread.
 *  - PB3_THREAD_FLAG_RUN_HALF : process the FIRST half of the latency block
 *                               (latencySize/2 samples), then clear the bit.
 *  - PB3_THREAD_FLAG_RUN_FULL : process the SECOND half of the latency block,
 *                               clear the bit and signal the "ended" event.
 *
 * The flags are read and modified only while info->mainSection is held.
 * Note that processLRModel() itself is also called with mainSection held.
 */
#ifdef WIN32
static unsigned __stdcall ProcessBlockThread(void *vdParam)
#else
static void * ProcessBlockThread(void *vdParam)
#endif
{
  ProcessBlockThreadInfo *info = (ProcessBlockThreadInfo*)vdParam;
#ifdef WIN32
  // The event names embed this thread's id; ProcessBlock::resume() builds the
  // same names from the id returned by _beginthreadex, so both sides open the
  // same pair of named events.
  unsigned int threadId = GetCurrentThreadId();
  wchar_t eventName_StartThread[_MAX_PATH];
  wchar_t eventName_ThreadEnded[_MAX_PATH];
  wsprintfW(eventName_StartThread, PB3_EVENT_PREFIX L"%d" PB3_EVENT_START, threadId);
  wsprintfW(eventName_ThreadEnded, PB3_EVENT_PREFIX L"%d" PB3_EVENT_ENDED, threadId);
  HANDLE event_StartThread = OpenEventW(EVENT_ALL_ACCESS, FALSE, eventName_StartThread);
  HANDLE event_ThreadEnded = OpenEventW(EVENT_ALL_ACCESS, FALSE, eventName_ThreadEnded);
#endif
  // Local snapshot of the shared flags, refreshed under the lock on every
  // pass of the inner loop.
  volatile int _threadFlags = PB3_THREAD_FLAG_0;
  while(1)
    {
      // Outer loop: block until the owner signals "start", then re-arm the
      // event before looking at the flags.
#ifdef WIN32
      Sleep(0);
      WaitForSingleObject(event_StartThread, INFINITE);
      ResetEvent(event_StartThread);
#else
      sleep(0);
      info->threadStart->wait();
      info->threadStart->reset();
#endif
      // Inner loop: keep servicing requests until no flag bit is left.
      while(1)
	{
	  info->mainSection->lock();
	  _threadFlags = *info->threadFlags;
	  info->mainSection->unlock();
	  // Nothing pending: go back to waiting on the start event.
	  if(_threadFlags == PB3_THREAD_FLAG_0) break;
	  if(_threadFlags & PB3_THREAD_FLAG_EXIT)
	    {
	      // XOR toggles the bit; it clears it provided the bit is still set
	      // as it was in the snapshot taken above.
	      info->mainSection->lock();
	      *info->threadFlags ^= PB3_THREAD_FLAG_EXIT;
	      info->mainSection->unlock();
	      goto exit_wait_loop;
	    }
	  if(_threadFlags & PB3_THREAD_FLAG_RUN_HALF)
	    {
	      // First half: samples [0, latencySize/2) of the thread buffers.
	      info->mainSection->lock();
	      long bSize = (*info->latencySize)/2;
	      pfloat_t *inL = info->inputBlock->L, *inR = info->inputBlock->R, *outL = info->outputBlock->L, *outR = info->outputBlock->R;
	      info->hostClass->processLRModel(inL, inR, outL, outR, bSize);
	      *info->threadFlags ^= PB3_THREAD_FLAG_RUN_HALF;
	      info->mainSection->unlock();
	      continue;
	    }
	  if(_threadFlags & PB3_THREAD_FLAG_RUN_FULL)
	    {
	      // Second half: same buffers, all four pointers advanced by bSize.
	      info->mainSection->lock();
	      long bSize = (*info->latencySize)/2;
	      pfloat_t *inL = info->inputBlock->L, *inR = info->inputBlock->R, *outL = info->outputBlock->L, *outR = info->outputBlock->R;
	      inL += bSize, inR += bSize, outL += bSize, outR += bSize;
	      info->hostClass->processLRModel(inL, inR, outL, outR, bSize);
	      *info->threadFlags ^= PB3_THREAD_FLAG_RUN_FULL;
	      // The whole block is now processed: wake the owner, which waits on
	      // the "ended" event in processLRModelThreaded().
#ifdef WIN32
	      SetEvent(event_ThreadEnded);
#else
	      info->threadEnded->trigger();
#endif
	      info->mainSection->unlock();
	      continue;
	    }
	}
    }
 exit_wait_loop:
  // Reached only through the EXIT request above.
#ifdef WIN32
  CloseHandle(event_StartThread);
  CloseHandle(event_ThreadEnded);
  _endthreadex(0);
#else
  pthread_exit(NULL);
#endif
  // Present so that the function has a return statement matching its
  // declared return type.
  return 0; 
}

/**
 * Feeds sampleFrames samples through the latency buffers and lets the worker
 * thread do the actual processLRModel() work.
 *
 * Input is appended to latencyBlockInput at offset savedSize while output is
 * read from latencyBlockOutput at the same offset, i.e. the caller receives
 * the result of the block that was handed to the worker previously.
 *
 *  - When the first half of the block has been collected it is copied to
 *    latencyBlockThreadInput and the worker is asked to RUN_HALF.
 *  - When the block is complete the second half is copied, the worker is
 *    asked to RUN_FULL, and savedSize is set to -1.
 *  - savedSize == -1 at the top of the loop means "a full block is with the
 *    worker": wait for its "ended" event, then fetch its output.
 *
 * The loop repeats until all sampleFrames samples have been consumed, so a
 * single call may span several latency blocks.
 */
void ProcessBlock::processLRModelThreaded(pfloat_t *inL, pfloat_t *inR, pfloat_t *outL, pfloat_t *outR, VstInt32 sampleFrames)
{
  while(true)
    {
      if(savedSize == -1)
	{
	  // A complete block was handed over earlier: kick the worker once more,
	  // wait until it reports completion, then re-arm the "ended" event.
#ifdef WIN32
	  SetEvent(event_trigger);
	  WaitForSingleObject(event_waitfor, INFINITE);
	  ResetEvent(event_waitfor);
#else
	  threadStart.trigger();
	  threadEnded.wait();
	  threadEnded.reset();
#endif
	  // Start a fresh block and publish the worker's result for reading.
	  savedSize = 0; runHalf = false;
	  memcpy(latencyBlockOutput.L, latencyBlockThreadOutput.L, sizeof(pfloat_t)*latencySize);
	  memcpy(latencyBlockOutput.R, latencyBlockThreadOutput.R, sizeof(pfloat_t)*latencySize);
	}
      // Copy as much as fits in the remainder of the current block.
      long copySize = latencySize - savedSize;
      if(copySize > sampleFrames) copySize = sampleFrames;
      memcpy(latencyBlockInput.L+savedSize, inL, sizeof(pfloat_t)*copySize);
      memcpy(latencyBlockInput.R+savedSize, inR, sizeof(pfloat_t)*copySize);
      memcpy(outL, latencyBlockOutput.L+savedSize, sizeof(pfloat_t)*copySize);
      memcpy(outR, latencyBlockOutput.R+savedSize, sizeof(pfloat_t)*copySize);

      savedSize += copySize;
      inL += copySize; inR += copySize; outL += copySize; outR += copySize;
      sampleFrames -= copySize;

      // First half collected (and not yet submitted for this block): hand it
      // to the worker so that it can start before the block is complete.
      if(savedSize >= latencySize/2&&runHalf == false)
	{
	  memcpy(latencyBlockThreadInput.L, latencyBlockInput.L, sizeof(pfloat_t)*latencySize/2);
	  memcpy(latencyBlockThreadInput.R, latencyBlockInput.R, sizeof(pfloat_t)*latencySize/2);
	  mainSection.lock();
	  threadFlags |= PB3_THREAD_FLAG_RUN_HALF;
	  mainSection.unlock();
#ifdef WIN32
	  SetEvent(event_trigger);
#else
	  threadStart.trigger();
#endif
	  runHalf = true;
	}
      
      // Block complete: submit the second half and mark the block as pending
      // (savedSize = -1) so the next pass waits for the worker.
      if(savedSize == latencySize)
	{
	  memcpy(latencyBlockThreadInput.L+latencySize/2, latencyBlockInput.L+latencySize/2, sizeof(pfloat_t)*latencySize/2);
	  memcpy(latencyBlockThreadInput.R+latencySize/2, latencyBlockInput.R+latencySize/2, sizeof(pfloat_t)*latencySize/2);
	  mainSection.lock();
	  threadFlags |= PB3_THREAD_FLAG_RUN_FULL;
	  mainSection.unlock();
#ifdef WIN32
	  SetEvent(event_trigger);
#else
	  threadStart.trigger();
#endif
	  savedSize = -1;
	}

      if(sampleFrames <= 0) break;
    }
}

/**
 * Builds an idle ProcessBlock: no scratch buffer, no latency block, and a
 * hostInfo record wired to this object's members. The worker thread is
 * started last, through resume(), once hostInfo is completely filled in.
 */
ProcessBlock::ProcessBlock()
{
  // Sizes start at zero; buffers are allocated on demand later.
  this->latencySize = 0;
  this->processBlockSize = 0;

  // Synchronisation primitives handed to the worker thread.
#ifndef WIN32
  this->hostInfo.threadEnded = &this->threadEnded;
  this->hostInfo.threadStart = &this->threadStart;
#endif
  this->hostInfo.mainSection = &this->mainSection;
  this->hostInfo.threadFlags = &this->threadFlags;

  // Data the worker thread operates on.
  this->hostInfo.outputBlock = &this->latencyBlockThreadOutput;
  this->hostInfo.inputBlock = &this->latencyBlockThreadInput;
  this->hostInfo.latencySize = &this->latencySize;
  this->hostInfo.hostClass = this;

  this->resume();
}

/**
 * Stops the worker thread first, then releases the scratch buffers.
 */
ProcessBlock::~ProcessBlock()
{
  // The worker must be gone before anything it may touch is torn down.
  this->suspend();
  this->freeProcessBlock();
}

/**
 * Resets the streaming state, silences all four latency buffers and starts a
 * new worker thread. Everything happens while mainSection is held.
 *
 * Counterpart of suspend(); the constructor, mute() and setLatency() call it
 * after the previous worker (if any) has been stopped.
 */
void ProcessBlock::resume()
{
  mainSection.lock();
  savedSize = 0; runHalf = false;
  threadFlags = PB3_THREAD_FLAG_0;
  latencyBlockInput.mute();
  latencyBlockOutput.mute();
  latencyBlockThreadInput.mute();
  latencyBlockThreadOutput.mute();
#ifdef WIN32
  // The thread is created suspended so that the two named events, whose
  // names embed the new thread id, exist before ProcessBlockThread tries to
  // open them.
  PBThreadHandle = (HANDLE)_beginthreadex(NULL, 0, ProcessBlockThread, &hostInfo, CREATE_SUSPENDED, &threadId);
  wsprintfW(eventName_StartThread, PB3_EVENT_PREFIX L"%d" PB3_EVENT_START, threadId);
  wsprintfW(eventName_ThreadEnded, PB3_EVENT_PREFIX L"%d" PB3_EVENT_ENDED, threadId);
  // Both events are manual-reset. "start" begins non-signalled, "ended"
  // begins signalled.
  event_StartThread = CreateEventW(NULL, TRUE, FALSE, eventName_StartThread);
  event_ThreadEnded = CreateEventW(NULL, TRUE, TRUE,  eventName_ThreadEnded);
  // Second handles to the same events, used by this side for signalling
  // (event_trigger) and waiting (event_waitfor).
  event_trigger = OpenEventW(EVENT_ALL_ACCESS, FALSE, eventName_StartThread);
  event_waitfor = OpenEventW(EVENT_ALL_ACCESS, FALSE, eventName_ThreadEnded);
  ResumeThread(PBThreadHandle);
#else
  threadStart.reset();
  threadEnded.reset();
  // NOTE(review): set() presumably leaves threadEnded signalled, mirroring
  // the initially-signalled "ended" event of the WIN32 branch - confirm.
  threadEnded.set();
  pthread_create(&PBThreadHandle, NULL, ProcessBlockThread, &hostInfo);
#endif
  mainSection.unlock();
}

/**
 * Asks the worker thread to exit and waits until it has terminated.
 *
 * The EXIT bit is set under mainSection, the start event is signalled so the
 * worker wakes up and sees it, and the thread is then joined. On WIN32 the
 * thread handle and the four event handles obtained in resume() are closed
 * afterwards.
 */
void ProcessBlock::suspend()
{
  mainSection.lock();
  threadFlags |= PB3_THREAD_FLAG_EXIT;
  mainSection.unlock();
#ifdef WIN32
  SetEvent(event_trigger);
  // Returns once the worker thread has ended.
  WaitForSingleObject(PBThreadHandle, INFINITE);
  mainSection.lock();
  CloseHandle(PBThreadHandle);
  CloseHandle(event_trigger);
  CloseHandle(event_waitfor);
  CloseHandle(event_StartThread);
  CloseHandle(event_ThreadEnded);
  mainSection.unlock();
#else
  threadStart.trigger();
  pthread_join(PBThreadHandle, NULL);
#endif
}

/**
 * Silences every internal buffer.
 *
 * The worker thread is stopped first (suspend()), the buffers are muted
 * under mainSection, and a fresh worker is started again (resume()).
 * resume() mutes the four latency buffers once more on its own.
 */
void ProcessBlock::mute()
{
  suspend();
  mainSection.lock();
#ifdef PLUGDOUBLE
  // inputBlock is only used in PLUGDOUBLE builds.
  inputBlock.mute();
#endif
  outputBlock.mute();
  latencyBlockInput.mute();
  latencyBlockOutput.mute();
  latencyBlockThreadInput.mute();
  latencyBlockThreadOutput.mute();
  mainSection.unlock();
  resume();
}

/**
 * (Re)allocates the per-call scratch buffers for `size` samples per channel.
 *
 * Any previously allocated scratch buffer is released first.
 *
 * @param size  number of samples each of the two channels must hold
 * @return      `size` on success (also stored in processBlockSize);
 *              0 if the allocation threw std::bad_alloc, in which case the
 *              buffers are released and processBlockSize stays 0.
 *
 * Callers compare the return value against the requested size to detect
 * failure, so a failed allocation must not report `size`.
 */
VstInt32 ProcessBlock::allocProcessBlock(VstInt32 size)
{
  freeProcessBlock();
  try
    {
#ifdef PLUGDOUBLE
      inputBlock.alloc(size, 2);
#endif
      outputBlock.alloc(size, 2);
    }
  catch(const std::bad_alloc &)
    {
      // Drop whatever was partially allocated; freeProcessBlock() also
      // resets processBlockSize to 0.
      freeProcessBlock();
      return 0;
    }
  return (processBlockSize = size);
}

void ProcessBlock::freeProcessBlock()
{
#ifdef PLUGDOUBLE
  inputBlock.free();
#endif
  outputBlock.free();
  processBlockSize = 0;
}

long ProcessBlock::getLatency()
{
  return latencySize;
}

/**
 * Sets the latency block size and (re)allocates the four latency buffers.
 *
 * @param size  requested block size in samples. Values <= 0 are ignored.
 *              Odd values are rounded up to the next even number because
 *              the worker thread processes the block as two halves of
 *              latencySize/2 samples.
 *
 * The worker thread is stopped while the buffers are replaced and restarted
 * afterwards. If an allocation throws std::bad_alloc all four buffers are
 * released and latencySize is set to 0, so that the process callbacks take
 * their direct (non-threaded) path instead of touching freed buffers.
 */
void ProcessBlock::setLatency(long size)
{
  if(size <= 0) return;
  if(size % 2 == 1) size++;
  suspend();
  mainSection.lock();
  try
    {
      latencyBlockInput.alloc(size, 2);
      latencyBlockOutput.alloc(size, 2);
      latencyBlockThreadInput.alloc(size, 2);
      latencyBlockThreadOutput.alloc(size, 2);
      // Publish the new size only once every buffer really exists.
      latencySize = size;
    }
  catch(const std::bad_alloc &)
    {
      latencyBlockInput.free();
      latencyBlockOutput.free();
      latencyBlockThreadInput.free();
      latencyBlockThreadOutput.free();
      // No usable latency buffers: disable the threaded path.
      latencySize = 0;
    }
  mainSection.unlock();
  resume();
}

/**
 * Widens `size` samples from single to double precision.
 *
 * @param i     source samples (float)
 * @param o     destination samples (double)
 * @param size  number of samples to convert; nothing is done if <= 0
 */
void ProcessBlock::float2double(float *i, double *o, int size)
{
  int pos = 0;
  while(pos < size)
    {
      o[pos] = static_cast<double>(i[pos]);
      ++pos;
    }
}

/**
 * Narrows `size` samples from double to single precision.
 *
 * @param i     source samples (double)
 * @param o     destination samples (float)
 * @param size  number of samples to convert; nothing is done if <= 0
 */
void ProcessBlock::double2float(double *i, float *o, int size)
{
  int pos = 0;
  while(pos < size)
    {
      o[pos] = static_cast<float>(i[pos]);
      ++pos;
    }
}

/**
 * Accumulating process callback: runs the stereo input through the model and
 * ADDS the result to outputs[0] / outputs[1].
 *
 * @param inputs        inputs[0] and inputs[1] are read as left and right
 * @param outputs       outputs[0] and outputs[1] receive `+=` of the result
 * @param sampleFrames  number of samples per channel
 *
 * The MXCSR register is saved on entry, replaced by FZ|DAZ|EMASK_ALL while
 * processing, and restored on every exit path - including the early return
 * taken when the scratch buffer cannot be (re)allocated.
 *
 * With latencySize <= 0 processLRModel() is called directly; otherwise the
 * work goes through processLRModelThreaded().
 */
void ProcessBlock::p_process(float **inputs, float **outputs, VstInt32 sampleFrames)
{
  uint32_t mxcsr = UTILS::getMXCSR();
  UTILS::setMXCSR(FV3_FLAG_MXCSR_FZ|FV3_FLAG_MXCSR_DAZ|FV3_FLAG_MXCSR_EMASK_ALL);
  // Grow the scratch buffer on demand; give up silently if that fails.
  if(processBlockSize < sampleFrames && allocProcessBlock(sampleFrames) != sampleFrames)
    {
      // Do not leak the modified floating point state into the caller.
      UTILS::setMXCSR(mxcsr);
      return;
    }
#ifdef PLUGDOUBLE
  // Double precision build: convert the float input into inputBlock first.
  float2double(inputs[0], inputBlock.L, sampleFrames);
  float2double(inputs[1], inputBlock.R, sampleFrames);
  outputBlock.mute(sampleFrames);
  if(latencySize <= 0)
    processLRModel(inputBlock.L, inputBlock.R, outputBlock.L, outputBlock.R, sampleFrames);
  else
    processLRModelThreaded(inputBlock.L, inputBlock.R, outputBlock.L, outputBlock.R, sampleFrames);
  for(VstInt32 i = 0;i < sampleFrames;i ++)
    {
      outputs[0][i] += static_cast<float>(outputBlock.L[i]);
      outputs[1][i] += static_cast<float>(outputBlock.R[i]);
    }
#else
  // Single precision build: the host buffers are used as input directly.
  outputBlock.mute(sampleFrames);
  if(latencySize <= 0)
    processLRModel(inputs[0], inputs[1], outputBlock.L, outputBlock.R, sampleFrames);
  else
    processLRModelThreaded(inputs[0], inputs[1], outputBlock.L, outputBlock.R, sampleFrames);
  for(VstInt32 i = 0;i < sampleFrames;i ++)
    {
      outputs[0][i] += outputBlock.L[i];
      outputs[1][i] += outputBlock.R[i];
    }
#endif
  UTILS::setMXCSR(mxcsr);
}

/**
 * Replacing process callback: runs the stereo input through the model and
 * OVERWRITES outputs[0] / outputs[1] with the result.
 *
 * @param inputs        inputs[0] and inputs[1] are read as left and right
 * @param outputs       outputs[0] and outputs[1] are overwritten
 * @param sampleFrames  number of samples per channel
 *
 * The MXCSR register is saved on entry, replaced by FZ|DAZ|EMASK_ALL while
 * processing, and restored on every exit path - including the early return
 * taken when the scratch buffer cannot be (re)allocated.
 *
 * In PLUGDOUBLE builds the input is widened into inputBlock, the work is
 * delegated to p_processDoubleReplacing() operating on inputBlock and
 * outputBlock, and the result is narrowed back to float.
 */
void ProcessBlock::p_processReplacing(float **inputs, float **outputs,
				VstInt32 sampleFrames)
{
  uint32_t mxcsr = UTILS::getMXCSR();
  UTILS::setMXCSR(FV3_FLAG_MXCSR_FZ|FV3_FLAG_MXCSR_DAZ|FV3_FLAG_MXCSR_EMASK_ALL);
  // Grow the scratch buffer on demand; give up silently if that fails.
  if(processBlockSize < sampleFrames && allocProcessBlock(sampleFrames) != sampleFrames)
    {
      // Do not leak the modified floating point state into the caller.
      UTILS::setMXCSR(mxcsr);
      return;
    }
#ifdef PLUGDOUBLE
  float2double(inputs[0], inputBlock.L, sampleFrames);
  float2double(inputs[1], inputBlock.R, sampleFrames);
  double * I[2] = {inputBlock.L, inputBlock.R};
  double * O[2] = {outputBlock.L, outputBlock.R};
  p_processDoubleReplacing(I, O, sampleFrames);
  double2float(outputBlock.L, outputs[0], sampleFrames);
  double2float(outputBlock.R, outputs[1], sampleFrames);
#else
  outputBlock.mute(sampleFrames);
  if(latencySize <= 0)
    processLRModel(inputs[0], inputs[1], outputBlock.L, outputBlock.R, sampleFrames);
  else
    processLRModelThreaded(inputs[0], inputs[1], outputBlock.L, outputBlock.R, sampleFrames);
  memcpy(outputs[0], outputBlock.L, sizeof(float)*sampleFrames);
  memcpy(outputs[1], outputBlock.R, sizeof(float)*sampleFrames);
#endif
  UTILS::setMXCSR(mxcsr);
}

#ifdef PLUGDOUBLE
/**
 * Double precision replacing process callback (PLUGDOUBLE builds only).
 *
 * @param inputs        inputs[0] and inputs[1] are read as left and right
 * @param outputs       outputs[0] and outputs[1] are overwritten
 * @param sampleFrames  number of samples per channel
 *
 * The MXCSR register is saved on entry, replaced by FZ|DAZ|EMASK_ALL while
 * processing, and restored on every exit path - including the early return
 * taken when the scratch buffer cannot be (re)allocated.
 *
 * p_processReplacing() calls this function with inputs/outputs pointing at
 * inputBlock/outputBlock themselves. memcpy() must not be given overlapping
 * ranges, so the staging copies are skipped when source and destination are
 * the same buffer.
 */
void ProcessBlock::p_processDoubleReplacing(double **inputs, double **outputs, VstInt32 sampleFrames)
{
  uint32_t mxcsr = UTILS::getMXCSR();
  UTILS::setMXCSR(FV3_FLAG_MXCSR_FZ|FV3_FLAG_MXCSR_DAZ|FV3_FLAG_MXCSR_EMASK_ALL);
  // Grow the scratch buffer on demand; give up silently if that fails.
  if(processBlockSize < sampleFrames && allocProcessBlock(sampleFrames) != sampleFrames)
    {
      // Do not leak the modified floating point state into the caller.
      UTILS::setMXCSR(mxcsr);
      return;
    }
  // Stage the input in inputBlock unless it already lives there.
  if(inputs[0] != inputBlock.L) memcpy(inputBlock.L, inputs[0], sizeof(double)*sampleFrames);
  if(inputs[1] != inputBlock.R) memcpy(inputBlock.R, inputs[1], sizeof(double)*sampleFrames);
  outputBlock.mute(sampleFrames);
  if(latencySize <= 0)
    processLRModel(inputBlock.L, inputBlock.R, outputBlock.L, outputBlock.R, sampleFrames);
  else
    processLRModelThreaded(inputBlock.L, inputBlock.R, outputBlock.L, outputBlock.R, sampleFrames);
  // Hand the result to the caller unless it asked for it in outputBlock.
  if(outputs[0] != outputBlock.L) memcpy(outputs[0], outputBlock.L, sizeof(double)*sampleFrames);
  if(outputs[1] != outputBlock.R) memcpy(outputs[1], outputBlock.R, sizeof(double)*sampleFrames);
  UTILS::setMXCSR(mxcsr);
}
#endif

/**
 * Pass-through stereo processing: copies sampleFrames samples from each
 * input channel to the matching output channel.
 *
 * @param inL, inR      left / right input samples
 * @param outL, outR    left / right output samples
 * @param sampleFrames  number of samples per channel
 *
 * NOTE(review): presumably meant to be replaced by a real effect in a
 * derived class - confirm against the class declaration.
 */
void ProcessBlock::processLRModel(pfloat_t *inL, pfloat_t *inR, pfloat_t *outL, pfloat_t *outR, VstInt32 sampleFrames)
{
  // Both channels have the same length, so compute the byte count once.
  const size_t channelBytes = sizeof(pfloat_t)*sampleFrames;
  memcpy(outL, inL, channelBytes);
  memcpy(outR, inR, channelBytes);
}
