admin管理员组

文章数量:1279055

I'm using ElevenLabs Conversational AI API, which uses Websockets. And I'm using Unity C# on the client side.

I can receive audio just fine, but when I send my reply to it, I don't get a response.

After I send my audio, the server just keeps sending "ping" forever (to which I reply with the "pong").

What's strange is that when I go to the ElevenLabs website, open the call history, and play the call back, it appears that I actually did send my audio, but the AI hangs up shortly afterwards as if I hadn't said anything.

Just to make sure it isn't just AI being stupid, I talked with it directly on the site and it works perfectly.

Am I doing something wrong? I've tried changing websocket libraries, json parsers, and more, but nothing seems to work.

This is the doc I was following:

Here's the code:

using UnityEngine;
using NativeWebSocket;
using System;
using System.Threading.Tasks;
using System.Net.Http;
using System.Collections;

/// <summary>
/// Minimal ElevenLabs Conversational AI client for Unity.
/// Opens a WebSocket to a signed conversation URL, answers server pings,
/// plays the agent's audio chunks in arrival order, and sends one pre-recorded
/// <see cref="clip"/> as the user's audio a few seconds after start-up.
/// </summary>
class SendSound : MonoBehaviour
{
    /// <summary>Sample rate (Hz) used for both outgoing and incoming raw PCM.</summary>
    const int TargetSampleRate = 16000;

    /// <summary>Delay (seconds) between start-up and sending the user audio.</summary>
    const float SendAudioDelaySeconds = 3.5f;

    // Endpoint that issues a signed WebSocket URL for an agent.
    // NOTE(review): reconstructed from the ElevenLabs API reference — confirm against the current docs.
    const string SignedUrlEndpoint = "https://api.elevenlabs.io/v1/convai/conversation/get_signed_url";

    // One shared client: creating an HttpClient per request exhausts sockets under load.
    static readonly HttpClient sharedHttpClient = new HttpClient();

    /// <summary>Pre-recorded user utterance that is sent to the agent.</summary>
    public AudioClip clip;

    // Credentials are serialized so they can be set in the Inspector instead of being edited in code.
    [SerializeField] string apiKey = "MY_API_KEY";
    [SerializeField] string agentId = "AGENT_ID";

    // JSON message carrying the base64 user audio; null when no clip is assigned.
    string userAudioSend;
    AudioSource audioSource;
    WebSocket websocket;

    // Agent audio chunks waiting to be played, in arrival order.
    readonly System.Collections.Generic.Queue<AudioClip> pendingResponses =
        new System.Collections.Generic.Queue<AudioClip>();

    // The runtime-created clip currently assigned to the AudioSource (destroyed when replaced).
    AudioClip currentResponse;

    [System.Serializable]
    struct MessageType
    {
        public string type;
    };

    [System.Serializable]
    struct PingEvent
    {
        public int event_id;
    }

    [System.Serializable]
    struct MessagePing
    {
        // The wire format carries "type" as a plain string, not a nested object.
        public string type;
        public PingEvent ping_event;
    }

    [System.Serializable]
    struct SignedURL
    {
        public string signed_url;
    }

    [System.Serializable]
    struct AudioEvent
    {
        public string audio_base_64;
        public int event_id;
    }

    [System.Serializable]
    struct AudioResponse
    {
        public string type;
        public AudioEvent audio_event;
    }

    /// <summary>
    /// Unity start-up hook: fetches the signed URL, wires the socket events,
    /// starts the dispatch/playback coroutines and connects.
    /// </summary>
    private async void Start()
    {
        audioSource = GetComponent<AudioSource>();
        if (audioSource == null)
        {
            Debug.LogError("SendSound requires an AudioSource on the same GameObject.");
            return;
        }

        string signedUrlJson = await GetSignedUrl(apiKey, agentId);

        // The object may have been destroyed while the HTTP request was in flight.
        if (this == null)
        {
            return;
        }

        // GetSignedUrl reports failures as plain text, which is not valid JSON.
        SignedURL signedUrl;
        try
        {
            signedUrl = JsonUtility.FromJson<SignedURL>(signedUrlJson);
        }
        catch (ArgumentException)
        {
            Debug.LogError("Could not obtain a signed URL: " + signedUrlJson);
            return;
        }

        if (string.IsNullOrEmpty(signedUrl.signed_url))
        {
            Debug.LogError("Signed URL response did not contain 'signed_url': " + signedUrlJson);
            return;
        }

        websocket = new WebSocket(signedUrl.signed_url);

        websocket.OnOpen += () =>
        {
            Debug.Log("Connection open!");
            if (websocket.State != WebSocketState.Open)
            {
                Debug.LogError("Not open");
            }
            websocket.SendText("{ \"type\": \"conversation_initiation_client_data\" }");
        };

        websocket.OnError += (e) =>
        {
            Debug.Log("Error! " + e);
        };

        websocket.OnClose += (e) =>
        {
            Debug.Log("Connection closed!");
        };

        websocket.OnMessage += HandleMessage;

        if (clip != null)
        {
            userAudioSend = "{\"user_audio_chunk\":\"" + ConvertAudioClipToBase64(clip) + "\"}";
        }
        else
        {
            Debug.LogWarning("No AudioClip assigned; no user audio will be sent.");
        }

        StartCoroutine(DispatchQueue());
        StartCoroutine(UpdateSound());
        Invoke(nameof(SendAudio), SendAudioDelaySeconds);

        await websocket.Connect();
    }

    /// <summary>
    /// Handles one text frame from the server: replies to pings, queues agent
    /// audio for playback, and stops playback on interruption.
    /// </summary>
    /// <param name="bytes">UTF-8 encoded JSON message.</param>
    void HandleMessage(byte[] bytes)
    {
        Debug.Log("OnMessage!");

        var message = System.Text.Encoding.UTF8.GetString(bytes);
        Debug.Log(message);

        // A malformed message must not escape: an exception here would terminate
        // the DispatchQueue coroutine and silently stop all further message handling.
        try
        {
            MessageType mt = JsonUtility.FromJson<MessageType>(message);
            switch (mt.type)
            {
                case "ping":
                    MessagePing ping = JsonUtility.FromJson<MessagePing>(message);
                    string pong = "{\"type\":\"pong\",\"event_id\":" + ping.ping_event.event_id + "}";
                    Debug.Log("WE PONGED: " + pong);
                    websocket.SendText(pong);
                    break;
                case "audio":
                    AudioResponse audioResponse = JsonUtility.FromJson<AudioResponse>(message);
                    AudioClip chunk = Base64ToAudioClip(audioResponse.audio_event.audio_base_64);
                    if (chunk != null)
                    {
                        pendingResponses.Enqueue(chunk);
                    }
                    break;
                case "interruption":
                    audioSource.Stop();
                    DiscardPendingResponses();
                    break;
            }
        }
        catch (ArgumentException ex)
        {
            Debug.LogError("Could not parse server message: " + ex.Message);
        }
        catch (FormatException ex)
        {
            Debug.LogError("Server audio was not valid base64: " + ex.Message);
        }
    }

    /// <summary>Drops (and destroys) every queued agent audio chunk.</summary>
    void DiscardPendingResponses()
    {
        while (pendingResponses.Count > 0)
        {
            Destroy(pendingResponses.Dequeue());
        }
    }

    /// <summary>
    /// Sends the prepared user audio message if the socket is open.
    /// Scheduled through <c>Invoke</c>, so failures are logged here rather than thrown.
    /// </summary>
    async Task SendAudio()
    {
        if (userAudioSend == null)
        {
            Debug.LogWarning("NOT SEND: no audio prepared");
            return;
        }

        if (websocket == null || websocket.State != WebSocketState.Open)
        {
            string state = websocket == null ? "no socket" : websocket.State.ToString();
            Debug.Log("NOT SEND: " + state);
            return;
        }

        // NOTE(review): the whole clip goes out as a single chunk. If the agent never
        // answers, try streaming smaller chunks and/or appending trailing silence so the
        // server can detect the end of the user's turn — confirm against the API docs.
        try
        {
            Debug.Log("WE SENT AUDIO");
            await websocket.SendText(userAudioSend);
        }
        catch (Exception ex)
        {
            // Invoke discards the returned Task, so this is the only place the failure is visible.
            Debug.LogError("Sending audio failed: " + ex.Message);
        }
    }

    /// <summary>
    /// Plays queued agent audio chunks one after another, starting the next
    /// chunk as soon as the AudioSource is idle.
    /// </summary>
    IEnumerator UpdateSound()
    {
        while (true)
        {
            if (!audioSource.isPlaying && pendingResponses.Count > 0)
            {
                // Clips created at runtime are not garbage collected; release the finished one.
                if (currentResponse != null)
                {
                    Destroy(currentResponse);
                }

                currentResponse = pendingResponses.Dequeue();
                audioSource.clip = currentResponse;
                audioSource.Play();
            }
            yield return null;
        }
    }

    /// <summary>Pumps the socket's message queue once per frame so events fire on the main thread.</summary>
    IEnumerator DispatchQueue()
    {
        while (true)
        {
            websocket.DispatchMessageQueue();
            yield return null;
        }
    }

    /// <summary>Closes the socket on shutdown, if one was ever created.</summary>
    private async void OnApplicationQuit()
    {
        if (websocket != null)
        {
            await websocket.Close();
        }
    }

    /// <summary>
    /// Converts a clip to base64-encoded raw PCM (mono, 16000 Hz, signed 16-bit little-endian, no header).
    /// </summary>
    /// <param name="clip">Clip to encode; may be null.</param>
    /// <returns>The base64 string, or null when <paramref name="clip"/> is null.</returns>
    public static string ConvertAudioClipToBase64(AudioClip clip)
    {
        if (clip == null)
        {
            return null;
        }

        float[] samples = ResampleTo16000(clip);
        byte[] pcmBytes = ConvertTo16BitPCM(samples);
        return Convert.ToBase64String(pcmBytes);
    }

    /// <summary>
    /// Reads the clip's samples, mixes all channels down to mono, and linearly
    /// resamples the result to 16000 Hz.
    /// </summary>
    private static float[] ResampleTo16000(AudioClip clip)
    {
        int channels = Mathf.Max(1, clip.channels);
        float[] interleaved = new float[clip.samples * channels];
        if (interleaved.Length == 0)
        {
            return interleaved;
        }

        clip.GetData(interleaved, 0);

        // GetData returns interleaved frames; resampling them as if they were mono
        // would scramble the channels, so downmix first.
        float[] mono = DownmixToMono(interleaved, channels);

        if (clip.frequency == TargetSampleRate)
        {
            return mono;
        }

        // double keeps the source position exact for long clips, where float runs out of precision.
        double ratio = (double)clip.frequency / TargetSampleRate;
        float[] resampled = new float[Mathf.RoundToInt((float)(mono.Length / ratio))];
        int lastIndex = mono.Length - 1;

        for (int i = 0; i < resampled.Length; i++)
        {
            double srcPos = i * ratio;
            int prevIndex = Math.Min((int)srcPos, lastIndex);
            int nextIndex = Math.Min(prevIndex + 1, lastIndex);
            float t = (float)(srcPos - prevIndex);

            resampled[i] = Mathf.Lerp(mono[prevIndex], mono[nextIndex], t);
        }

        return resampled;
    }

    /// <summary>Averages each interleaved frame into a single mono sample.</summary>
    private static float[] DownmixToMono(float[] interleaved, int channels)
    {
        if (channels <= 1)
        {
            return interleaved;
        }

        int frames = interleaved.Length / channels;
        float[] mono = new float[frames];

        for (int frame = 0; frame < frames; frame++)
        {
            float sum = 0f;
            int start = frame * channels;
            for (int c = 0; c < channels; c++)
            {
                sum += interleaved[start + c];
            }
            mono[frame] = sum / channels;
        }

        return mono;
    }

    /// <summary>
    /// Converts float samples in [-1, 1] to signed 16-bit little-endian PCM bytes.
    /// Out-of-range samples are clamped.
    /// </summary>
    private static byte[] ConvertTo16BitPCM(float[] samples)
    {
        byte[] pcm = new byte[samples.Length * 2];

        for (int i = 0; i < samples.Length; i++)
        {
            short value = (short)(Mathf.Clamp(samples[i], -1f, 1f) * short.MaxValue);

            // Written byte-by-byte so the output is little-endian on every platform.
            pcm[i * 2] = (byte)(value & 0xFF);
            pcm[i * 2 + 1] = (byte)((value >> 8) & 0xFF);
        }

        return pcm;
    }

    /// <summary>
    /// Decodes base64 raw PCM (treated as mono, 16000 Hz, signed 16-bit little-endian) into an AudioClip.
    /// </summary>
    /// <param name="base64String">Base64 payload; may be null or empty.</param>
    /// <returns>The decoded clip, or null when the payload holds no complete sample.</returns>
    /// <exception cref="FormatException">The payload is not valid base64.</exception>
    public static AudioClip Base64ToAudioClip(string base64String)
    {
        if (string.IsNullOrEmpty(base64String))
        {
            return null;
        }

        byte[] audioBytes = Convert.FromBase64String(base64String);

        // Two bytes per sample; a trailing odd byte is ignored.
        int sampleCount = audioBytes.Length / 2;
        if (sampleCount == 0)
        {
            return null;
        }

        float[] samples = new float[sampleCount];
        for (int i = 0; i < sampleCount; i++)
        {
            int offset = i * 2;
            short sampleInt16 = (short)(audioBytes[offset] | (audioBytes[offset + 1] << 8));
            samples[i] = sampleInt16 / 32768.0f; // Normalize to [-1, 1)
        }

        // NOTE(review): assumes the agent's output format is 16 kHz mono PCM — confirm in the agent settings.
        AudioClip audioClip = AudioClip.Create(
            "DecodedAudio",
            sampleCount,
            1,
            TargetSampleRate,
            false
        );

        audioClip.SetData(samples, 0);

        return audioClip;
    }

    /// <summary>
    /// Requests a signed WebSocket URL for the given agent.
    /// </summary>
    /// <param name="apiKey">Value sent in the <c>xi-api-key</c> header.</param>
    /// <param name="agentId">Agent identifier, sent as the <c>agent_id</c> query parameter.</param>
    /// <returns>
    /// The raw response body on success; otherwise a plain-text message starting
    /// with "Error: " or "Request error: " (which is not valid JSON).
    /// </returns>
    public static async Task<string> GetSignedUrl(string apiKey, string agentId)
    {
        var url = $"{SignedUrlEndpoint}?agent_id={Uri.EscapeDataString(agentId)}";

        Debug.Log("Final URL: " + url);
        using var request = new HttpRequestMessage(HttpMethod.Get, url);
        request.Headers.Add("xi-api-key", apiKey);

        try
        {
            using var response = await sharedHttpClient.SendAsync(request);
            if (response.IsSuccessStatusCode)
            {
                return await response.Content.ReadAsStringAsync();
            }

            Debug.Log("WE ERORRED");
            return $"Error: {response.StatusCode}";
        }
        catch (HttpRequestException ex)
        {
            return $"Request error: {ex.Message}";
        }
        catch (TaskCanceledException ex)
        {
            // HttpClient signals a timeout by cancelling the request task.
            return $"Request error: {ex.Message}";
        }
    }

}

本文标签: c# ElevenLabs Websocket server not sending response Stack Overflow