///////////////////////////////////////////////////////////////////////////////////
// Copyright (C) 2023 Jon Beniston, M7RCE                                        //
//                                                                               //
// This program is free software; you can redistribute it and/or modify          //
// it under the terms of the GNU General Public License as published by          //
// the Free Software Foundation as version 3 of the License, or                  //
// (at your option) any later version.                                           //
//                                                                               //
// This program is distributed in the hope that it will be useful,               //
// but WITHOUT ANY WARRANTY; without even the implied warranty of                //
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                  //
// GNU General Public License V3 for more details.                               //
//                                                                               //
// You should have received a copy of the GNU General Public License             //
// along with this program. If not, see .          //
///////////////////////////////////////////////////////////////////////////////////
#include 
#include "dsp/cudavkfftengine.h"
CUDAvkFFTEngine::CUDAvkFFTEngine()
{
    VkFFTResult resFFT;
    resFFT = gpuInit();
    if (resFFT != VKFFT_SUCCESS)
    {
        qDebug() << "CUDAvkFFTEngine::CUDAvkFFTEngine: Failed to initialise GPU" << getVkFFTErrorString(resFFT);
        delete vkGPU;
        vkGPU = nullptr;
    }
}
CUDAvkFFTEngine::~CUDAvkFFTEngine()
{
    if (vkGPU)
    {
        freeAll();
        cuCtxDestroy(vkGPU->context);
    }
}
const QString CUDAvkFFTEngine::m_name = "vkFFT (CUDA)";
QString CUDAvkFFTEngine::getName() const
{
    return m_name;
}
VkFFTResult CUDAvkFFTEngine::gpuInit()
{
    CUresult res = CUDA_SUCCESS;
    cudaError_t res2 = cudaSuccess;
    res = cuInit(0);
    if (res != CUDA_SUCCESS) {
        return VKFFT_ERROR_FAILED_TO_INITIALIZE;
    }
    res2 = cudaSetDevice((int)vkGPU->device_id);
    if (res2 != cudaSuccess) {
        return VKFFT_ERROR_FAILED_TO_SET_DEVICE_ID;
    }
    res = cuDeviceGet(&vkGPU->device, (int)vkGPU->device_id);
    if (res != CUDA_SUCCESS) {
        return VKFFT_ERROR_FAILED_TO_GET_DEVICE;
    }
    res = cuDevicePrimaryCtxRetain(&vkGPU->context, (int)vkGPU->device);
    if (res != CUDA_SUCCESS) {
        return VKFFT_ERROR_FAILED_TO_CREATE_CONTEXT;
    }
    return VKFFT_SUCCESS;
}
VkFFTResult CUDAvkFFTEngine::gpuAllocateBuffers()
{
    cudaError_t res;
    CUDAPlan *plan = reinterpret_cast(m_currentPlan);
    // Allocate DMA accessible pinned memory, which may be faster than malloc'ed memory
    res = cudaHostAlloc(&plan->m_in, sizeof(Complex) * plan->n, cudaHostAllocMapped);
    if (res != cudaSuccess) {
        return VKFFT_ERROR_FAILED_TO_ALLOCATE;
    }
    res = cudaHostAlloc(&plan->m_out, sizeof(Complex) * plan->n, cudaHostAllocMapped);
    if (res != cudaSuccess) {
        return VKFFT_ERROR_FAILED_TO_ALLOCATE;
    }
    // Allocate GPU memory
    res = cudaMalloc((void**)&plan->m_buffer, sizeof(cuFloatComplex) * plan->n * 2);
    if (res != cudaSuccess) {
        return VKFFT_ERROR_FAILED_TO_ALLOCATE;
    }
    plan->m_configuration->buffer = (void**)&plan->m_buffer;
    return VKFFT_SUCCESS;
}
VkFFTResult CUDAvkFFTEngine::gpuConfigure()
{
    return VKFFT_SUCCESS;
}
void CUDAvkFFTEngine::transform()
{
    if (m_currentPlan)
    {
        CUDAPlan *plan = reinterpret_cast(m_currentPlan);
        cudaError_t res = cudaSuccess;
        void* buffer = ((void**)&plan->m_buffer)[0];
        // Transfer input from CPU to GPU memory
        PROFILER_START()
        res = cudaMemcpy(buffer, plan->m_in, plan->m_bufferSize, cudaMemcpyHostToDevice);
        PROFILER_STOP(QString("%1 TX %2").arg(getName()).arg(m_currentPlan->n))
        if (res != cudaSuccess) {
            qDebug() << "CUDAvkFFTEngine::transform: cudaMemcpy host to device failed";
        }
        // Perform FFT
        PROFILER_RESTART()
        VkFFTLaunchParams launchParams = {};
        VkFFTResult resFFT = VkFFTAppend(plan->m_app, plan->m_inverse ? 1 : -1, &launchParams);
        PROFILER_STOP(QString("%1 FFT %2").arg(getName()).arg(m_currentPlan->n))
        if (resFFT != VKFFT_SUCCESS) {
            qDebug() << "CUDAvkFFTEngine::transform: VkFFTAppend failed:" << getVkFFTErrorString(resFFT);
        }
        // Transfer result from GPU to CPU memory
        PROFILER_RESTART()
        res = cudaMemcpy(plan->m_out, buffer, plan->m_bufferSize, cudaMemcpyDeviceToHost);
        PROFILER_STOP(QString("%1 RX %2").arg(getName()).arg(m_currentPlan->n))
        if (res != cudaSuccess) {
            qDebug() << "CUDAvkFFTEngine::transform: cudaMemcpy device to host failed";
        }
    }
}
vkFFTEngine::Plan *CUDAvkFFTEngine::gpuAllocatePlan()
{
    return new CUDAPlan();
}
void CUDAvkFFTEngine::gpuDeallocatePlan(Plan *p)
{
    CUDAPlan *plan = reinterpret_cast(p);
    cudaFree(plan->m_in);
    plan->m_in = nullptr;
    cudaFree(plan->m_out);
    plan->m_out = nullptr;
    cudaFree(plan->m_buffer);
}