import { AudioContext, IAnalyserNode } from 'standardized-audio-context';
import { getPhonemes, phonemeToViseme, phonemeToVisemeOculus } from '@api/PhonemeProcessor'
import writtenNumber from 'written-number';

// This module defines a player that plays back a series of audio buffers and
// estimates, in real time, which phoneme of the accompanying text is being spoken.
/**
 * One unit of speech to be played: the audio data and the text it speaks.
 */
export interface SpeechSegment {
    // Resolves to the encoded audio data for this segment, or to `undefined`
    // when no audio is available (the player skips such segments).
    buffer: Promise<ArrayBuffer | undefined>;
    // The text spoken in the audio; the player derives its phonemes from it.
    // Segments whose text is the empty string are skipped by the player.
    text: string;
}

/**
 * Sentinel segment carrying neither audio nor text.
 * The player compares against this object by identity and skips it.
 */
export const NullSpeechSegment: SpeechSegment = {
    buffer: Promise.resolve<ArrayBuffer | undefined>(undefined),
    text: '',
};

/**
 * A single phoneme event recorded by the player during playback.
 */
export interface Phoneme {
    // Wall-clock time (Date.now(), in milliseconds) at which the phoneme was recorded.
    timestamp: number;
    // The phoneme text; the empty string for the trailing silence event.
    text: string;
    // The viseme the phoneme maps to, as returned by phonemeToVisemeOculus.
    viseme: string;
    // Milliseconds elapsed since the previously recorded phoneme (or since the
    // start of the segment for the first one); 0 for the trailing silence event.
    duration: number;
}

/**
 * Callback invoked by the player each time it records a phoneme
 * (including the silence event emitted when playback stops).
 */
export interface OnPhonemeCallback {
    (phoneme: Phoneme): void;
}

/**
 * Plays queued speech segments one after another through the Web Audio API and,
 * while a segment is playing, emits a time-based estimate of the phoneme that is
 * currently being spoken.
 *
 * - `play` enqueues a segment and starts playback when the player is idle.
 * - `playNext` decodes and starts the next playable segment in the queue, or stops
 *   the analyzer when the queue is empty.
 * - `startSpeechAnalyzer` converts the segment text to phonemes and starts a 5 ms
 *   polling timer that calls `analyze`.
 * - `analyze` samples the analyser node (tracking min/max of a few low FFT bins for
 *   diagnostic logging) and calls `recordPhoneme`.
 * - `recordPhoneme` advances through the phoneme list in proportion to the elapsed
 *   playback time and reports each phoneme through `onPhoneme`.
 * - `stopSpeechAnalyzer` flushes a pending phoneme, emits a silence event and stops
 *   the polling timer.
 */
export class WebSpeechPlayer {

    // Segments waiting to be played, in playback order.
    chunks: Promise<SpeechSegment>[] = [];
    audioContext: AudioContext;
    analyzer: IAnalyserNode<AudioContext>;
    isPlaying: boolean = false;
    // Handle of the polling timer started by startSpeechAnalyzer, if one is running.
    intervalId: ReturnType<typeof setInterval> | undefined = undefined;

    onPhoneme?: OnPhonemeCallback;

    // The activePhonemes array represents the phonemes text that are currently being processed.
    activePhonemes: string[] = [];
    // Index into activePhonemes of the last phoneme recorded; -1 before the first one.
    currPhIndex: number = -1;
    bufferDuration: number = 0;  // Buffer duration in seconds

    inPause = false;
    // Date.now() at the moment the current segment's analysis started.
    startTime: number = 0;
    // Min/max of the sampled FFT sum since the last recorded phoneme (used for logging only).
    thresholdMin: number = 0;
    thresholdMax: number = 0;
    pauseThreshold: number = 20;

    // Phonemes recorded for the current segment, in the order they were emitted.
    processedPhonemes: Phoneme[] = [];

    /**
     * @param onPhonemeCallback Optional callback invoked for every recorded phoneme.
     */
    constructor(onPhonemeCallback?: OnPhonemeCallback) {
        this.audioContext = new AudioContext();
        this.analyzer = this.audioContext.createAnalyser();
        this.analyzer.fftSize = 1024;
        if (onPhonemeCallback) {
            this.onPhoneme = onPhonemeCallback;
        }
    }

    /**
     * Decodes encoded audio data into an AudioBuffer.
     *
     * @param buffer Encoded audio data.
     * @returns The decoded audio; the promise rejects when decoding fails.
     */
    async loadAudio(buffer: ArrayBuffer): Promise<AudioBuffer> {
        // decodeAudioData already returns a promise, so no callback wrapper is needed.
        return this.audioContext.decodeAudioData(buffer);
    }

    /**
     * Resets the phoneme state for a new segment and starts the polling timer.
     *
     * @param text The text spoken by the segment that has just started playing.
     */
    startSpeechAnalyzer(text: string) {
        // A timer from the previous segment may still be running when segments are
        // played back-to-back; stop it so timers do not accumulate.
        this.clearAnalyzerInterval();
        this.isPlaying = true;
        this.currPhIndex = -1;
        console.log("Analyzing chunk: ", text);
        this.startTime = Date.now();
        this.activePhonemes = [];
        this.processedPhonemes = [];
        for (const word of this.tokenize(text)) {
            const phonemes = getPhonemes(word);
            if (phonemes !== undefined) {
                this.activePhonemes.push(...phonemes);
            }
        }
        console.log("Phonemes: ", this.activePhonemes);
        // Assume that the audio is paused at the beginning
        this.thresholdMin = Number.POSITIVE_INFINITY;
        this.thresholdMax = Number.NEGATIVE_INFINITY;
        this.inPause = true;
        // 5ms interval to analyze the audio signal
        this.intervalId = setInterval(() => {
            this.analyze();
        }, 5);
    }

    /**
     * Marks playback as finished, flushes a pending phoneme, emits the trailing
     * silence event and stops the polling timer.
     */
    stopSpeechAnalyzer() {
        this.isPlaying = false;
        this.recordPhoneme();
        this.recordSilence();
        this.clearAnalyzerInterval();
    }

    /**
     * Records a zero-duration silence event and reports it through `onPhoneme`.
     */
    recordSilence() {
        const silence: Phoneme = {
            timestamp: Date.now(),
            text: "",
            viseme: phonemeToVisemeOculus("SIL"),
            duration: 0
        };
        this.processedPhonemes.push(silence);
        this.onPhoneme?.(silence);
    }

    /**
     * Records the next phoneme if the elapsed playback time says it is due.
     *
     * The expected phoneme index is estimated by spreading the phonemes evenly
     * over the buffer duration. At most one phoneme is recorded per call; once
     * playback has stopped the estimate is forced to the last index so that a
     * still-pending phoneme is flushed.
     */
    recordPhoneme() {
        if (this.bufferDuration === 0) {
            return;
        }
        const currentTime = Date.now();
        // Total duration of the audio buffer that has been played so far.
        const elapsedSeconds = (currentTime - this.startTime) / 1000;
        const lastIndex = this.activePhonemes.length - 1;
        const estimatedIndex = this.isPlaying
            ? Math.floor(this.activePhonemes.length * elapsedSeconds / this.bufferDuration)
            : lastIndex;
        // Nothing new is due yet, or the estimate ran past the end of the list.
        if (estimatedIndex <= this.currPhIndex || estimatedIndex > lastIndex) {
            return;
        }
        this.currPhIndex++;
        let txt: string = this.activePhonemes[this.currPhIndex];
        const recorded = this.processedPhonemes;
        const prevTime = recorded.length > 0 ? recorded[recorded.length - 1].timestamp : this.startTime;
        const pho: Phoneme = {
            timestamp: currentTime,
            // Original text
            text: txt,
            // Convert the phoneme to viseme
            viseme: phonemeToVisemeOculus(txt),
            // Calculate the duration of the phoneme in ms
            duration: (currentTime - prevTime)
        };
        this.processedPhonemes.push(pho);
        this.onPhoneme?.(pho);
        // Print the phoneme and corresponding FFT min/max sum at the time of recording
        console.log(
            `[${estimatedIndex}] Phoneme: `, pho.text, "Viseme: ", pho.viseme,
            `Duration: ${pho.duration} ms`,
            this.thresholdMin, this.thresholdMax);

        this.thresholdMin = Number.POSITIVE_INFINITY;
        this.thresholdMax = Number.NEGATIVE_INFINITY;
    }

    /**
     * Timer tick: samples the analyser, updates the min/max of the sampled sum
     * and gives `recordPhoneme` a chance to emit the next phoneme.
     */
    analyze() {
        const dataArray = new Uint8Array(this.analyzer.frequencyBinCount);
        this.analyzer.getByteFrequencyData(dataArray);
        // Sum the magnitudes of FFT bins 2..6.
        let sum = 0;
        for (const magnitude of dataArray.subarray(2, 7)) {
            sum += magnitude;
        }
        this.thresholdMin = Math.min(this.thresholdMin, sum);
        this.thresholdMax = Math.max(this.thresholdMax, sum);
        this.recordPhoneme();
    }

    /**
     * Starts the next playable segment from the queue.
     *
     * Segments that are empty, have no audio, or fail to load/decode are skipped.
     * When the queue runs out the analyzer is stopped. The returned promise
     * resolves once a segment has started playing (not when it has finished) or
     * once the analyzer has been stopped.
     */
    async playNext(): Promise<void> {
        while (this.chunks.length > 0) {
            const chunkPromise = this.chunks.shift();
            if (typeof chunkPromise === 'undefined') {
                continue;
            }
            const segment = await this.prepareSegment(chunkPromise);
            if (typeof segment === 'undefined') {
                continue;
            }
            this.startSegment(segment.audioBuffer, segment.text);
            return;
        }
        console.log("End of playback #1");
        this.stopSpeechAnalyzer();
    }

    /**
     * Adds a segment to the queue and starts playback if the player is idle.
     *
     * @param chunk Promise of the segment to play.
     */
    public async play(chunk: Promise<SpeechSegment>): Promise<void> {
        this.chunks.push(chunk);
        if (!this.isPlaying) {
            this.isPlaying = true;
            await this.playNext();
        }
    }

    // Splits text into word tokens, spelling out numeric tokens as words.
    private tokenize(text: string): string[] {
        const separators = /[^\w]+/;
        const words: string[] = [];
        for (const token of text.split(separators)) {
            // split() yields empty strings next to leading/trailing punctuation;
            // Number("") is 0, so they must be dropped before the numeric check.
            if (token === '') {
                continue;
            }
            const value = Number(token);
            if (Number.isFinite(value)) {
                // Spelled-out numbers can be hyphenated ("twenty-one"); split them
                // with the same separators as the input text.
                const spelled = writtenNumber(value).split(separators);
                words.push(...spelled.filter((part) => part !== ''));
            } else {
                words.push(token);
            }
        }
        return words;
    }

    // Stops the polling timer if one is running.
    private clearAnalyzerInterval(): void {
        if (this.intervalId !== undefined) {
            clearInterval(this.intervalId);
            this.intervalId = undefined;
        }
    }

    // Resolves a queued segment to decoded audio plus text, or undefined when it must be skipped.
    private async prepareSegment(
        chunkPromise: Promise<SpeechSegment>
    ): Promise<{ audioBuffer: AudioBuffer; text: string } | undefined> {
        try {
            const chunk = await chunkPromise;
            if (chunk === NullSpeechSegment || typeof chunk === 'undefined' || chunk.text === '') {
                return undefined;
            }
            const buffer = await chunk.buffer;
            if (typeof buffer === 'undefined') {
                return undefined;
            }
            return { audioBuffer: await this.loadAudio(buffer), text: chunk.text };
        } catch (err: unknown) {
            // One bad segment must not leave the player stuck with isPlaying === true.
            console.error("Skipping speech segment that failed to load: ", err);
            return undefined;
        }
    }

    // Wires a decoded buffer into the audio graph, starts it and starts the analyzer.
    private startSegment(audioBuffer: AudioBuffer, text: string): void {
        const source = this.audioContext.createBufferSource();
        source.buffer = audioBuffer;
        // Route source -> analyser -> destination only. The analyser passes audio
        // through, so an additional direct source -> destination connection would
        // sum the same signal twice at the output.
        source.connect(this.analyzer);
        this.analyzer.connect(this.audioContext.destination);
        source.addEventListener('ended', () => {
            source.disconnect();
            this.analyzer.disconnect();
            this.playNext().catch((err: unknown) => {
                console.error("Failed to advance to the next speech segment: ", err);
            });
        });
        this.bufferDuration = audioBuffer.duration;
        source.start();
        this.startSpeechAnalyzer(text);
    }
}
