diff options
author | Kae <80987908+Novaenia@users.noreply.github.com> | 2023-07-14 13:13:19 +1000 |
---|---|---|
committer | Kae <80987908+Novaenia@users.noreply.github.com> | 2023-07-14 13:13:19 +1000 |
commit | 3b38825b34ebb95b59989934dc849858cee42c97 (patch) | |
tree | f044311d8cbf1d687bfcf807e5bf8764406ba000 | |
parent | f14f77724d6a06b759280c0885ce1102388faa38 (diff) |
more voice stuff
-rw-r--r-- | assets/opensb/binds/opensb.binds | 14 | ||||
-rw-r--r-- | source/application/StarMainApplication_sdl.cpp | 4 | ||||
-rw-r--r-- | source/client/StarClientApplication.cpp | 8 | ||||
-rw-r--r-- | source/frontend/StarVoice.cpp | 279 | ||||
-rw-r--r-- | source/frontend/StarVoice.hpp | 76 |
5 files changed, 322 insertions, 59 deletions
diff --git a/assets/opensb/binds/opensb.binds b/assets/opensb/binds/opensb.binds index b8a13c2..2f84e37 100644 --- a/assets/opensb/binds/opensb.binds +++ b/assets/opensb/binds/opensb.binds @@ -1,7 +1,8 @@ { "opensb": { "groups": { - "camera": { "name": "Camera" } + "camera": { "name": "Camera" }, + "voice": { "name": "Voice" } }, "name": "Open^#ebd74a;Starbound", "binds": { @@ -21,13 +22,10 @@ "group" : "camera", "name": "Zoom Out" }, - "test": { - "default": [{ - "type": "key", - "value": "C", - "mods": ["LShift"] - }], - "name": "Test Bind" + "pushToTalk": { + "default": [], + "group" : "voice", + "name": "Push To Talk" } } } diff --git a/source/application/StarMainApplication_sdl.cpp b/source/application/StarMainApplication_sdl.cpp index 86b9a10..1685cc0 100644 --- a/source/application/StarMainApplication_sdl.cpp +++ b/source/application/StarMainApplication_sdl.cpp @@ -344,8 +344,10 @@ public: SDL_AudioSpec obtained = {}; m_sdlAudioInputDevice = SDL_OpenAudioDevice(name, 1, &desired, &obtained, 0); - if (m_sdlAudioInputDevice) + if (m_sdlAudioInputDevice) { Logger::info("Opened audio input device '{}'", SDL_GetAudioDeviceName(m_sdlAudioInputDevice, 1)); + SDL_PauseAudioDevice(m_sdlAudioInputDevice, 0); + } else Logger::info("Failed to open audio input device: {}", SDL_GetError()); diff --git a/source/client/StarClientApplication.cpp b/source/client/StarClientApplication.cpp index f1c7595..8093582 100644 --- a/source/client/StarClientApplication.cpp +++ b/source/client/StarClientApplication.cpp @@ -375,6 +375,14 @@ void ClientApplication::update() { else if (m_state > MainAppState::Title) updateRunning(); + { // testing + m_voice->setLocalSpeaker(0); + m_voice->setInput(m_input->bindHeld("opensb", "pushToTalk")); + DataStreamBuffer data; + if (m_voice->send(data, 5000)) + m_voice->receive(m_voice->speaker(0), std::string_view(data.ptr(), data.size())); + } + m_guiContext->cleanup(); m_edgeKeyEvents.clear(); m_input->reset(); diff --git a/source/frontend/StarVoice.cpp b/source/frontend/StarVoice.cpp index e5cb299..fcfaa35 100644 --- a/source/frontend/StarVoice.cpp +++ b/source/frontend/StarVoice.cpp @@ -3,9 +3,9 @@ #include "StarApplicationController.hpp" #include "StarTime.hpp" #include "StarRoot.hpp" +#include "StarLogging.hpp" #include "opus/include/opus.h" -#include <queue> #include "SDL.h" constexpr int VOICE_SAMPLE_RATE = 48000; @@ -59,45 +59,10 @@ float getAudioLoudness(int16_t* data, size_t samples) { return highest; } -struct VoiceAudioChunk { - std::unique_ptr<int16_t[]> data; - size_t remaining; - size_t offset = 0; - - VoiceAudioChunk(int16_t* ptr, size_t size) { - data.reset(ptr); - remaining = size; - offset = 0; - } - - inline size_t takeSamples(std::vector<int16_t>& out, size_t count) { - size_t toRead = std::min<size_t>(count, remaining); - int16_t* start = data.get() + offset; - out.insert(out.end(), start, start + toRead); - offset += toRead; - remaining -= toRead; - return toRead; - } - - //this one's unsafe - inline int16_t takeSample() { - --remaining; - return *(data.get() + offset++); - } - - inline bool exhausted() { - return remaining == 0; - } -}; - struct VoiceAudioStream { // TODO: This should really be a ring buffer instead. std::queue<VoiceAudioChunk> chunks{}; size_t samples = 0; - atomic<bool> muted = false; - atomic<bool> playing = false; - atomic<float> decibelLevel = 0.0f; - atomic<Array<float, 2>> channelVolumes = Array<float, 2>::filled(1.0f); Mutex mutex; @@ -225,8 +190,8 @@ void Voice::save() const { } void Voice::scheduleSave() { - if (nextSaveTime == 0.0) - nextSaveTime = Time::monotonicTime() + 2.0; + if (!m_nextSaveTime) + m_nextSaveTime = Time::monotonicMilliseconds() + 2000; } Voice::SpeakerPtr Voice::setLocalSpeaker(SpeakerId speakerId) { @@ -248,19 +213,130 @@ Voice::SpeakerPtr Voice::speaker(SpeakerId speakerId) { } } -void Voice::getAudioData(uint8_t* stream, int len) { +void Voice::readAudioData(uint8_t* stream, int len) { + auto now = Time::monotonicMilliseconds(); + if (!m_encoder || m_inputMode == VoiceInputMode::PushToTalk && now > m_lastInputTime) + return; + + // Stop encoding if 2048 bytes have been encoded and not taken by the game thread yet + if (m_encodedChunksLength > 2048) + return; + + size_t samples = len / 2; + float decibels = getAudioLoudness((int16_t*)stream, samples); + m_clientSpeaker->decibelLevel = decibels; + + bool active = true; + + if (m_inputMode == VoiceInputMode::VoiceActivity) { + bool aboveThreshold = decibels > m_threshold; + if (aboveThreshold) + m_lastThresholdTime = now; + active = now - m_lastThresholdTime < 50; + } + + if (active) { + m_capturedChunksFrames += samples / m_deviceChannels; + auto data = (opus_int16*)malloc(len); + memcpy(data, stream, len); + m_capturedChunks.emplace(data, samples); + } + else { // Clear out any residual data so they don't manifest at the start of the next encode, whenever that is + while (!m_capturedChunks.empty()) + m_capturedChunks.pop(); + + m_capturedChunksFrames = 0; + } + + std::vector<opus_int16> takenSamples; + while (m_capturedChunksFrames >= VOICE_FRAME_SIZE) { + takenSamples.clear(); + size_t samplesToTake = VOICE_FRAME_SIZE * (size_t)m_deviceChannels; + takenSamples.reserve(samplesToTake); + + while (!m_capturedChunks.empty()) { + auto& front = m_capturedChunks.front(); + if (front.exhausted()) + m_capturedChunks.pop(); + else if ((samplesToTake -= front.takeSamples(takenSamples, samplesToTake)) == 0) + break; + } + m_capturedChunksFrames -= VOICE_FRAME_SIZE; + + ByteArray encodedData(VOICE_MAX_PACKET_SIZE, 0); + float vol = m_inputVolume; + if (m_inputVolume != 1.0f) { + for (size_t i = 0; i != takenSamples.size(); ++i) + takenSamples[i] *= m_inputVolume; + } + + + if (opus_int32 size = opus_encode(m_encoder.get(), takenSamples.data(), VOICE_FRAME_SIZE, (unsigned char*)encodedData.ptr(), VOICE_MAX_PACKET_SIZE)) { + if (size == 1) + continue; + encodedData.resize(size); + MutexLocker lock(m_captureMutex); + m_encodedChunks.emplace_back(move(encodedData)); // reset takes ownership of data buffer + m_encodedChunksLength += size; + Logger::info("Voice: encoded Opus chunk {} bytes big", size); + } + else if (size < 0) { + Logger::error("Voice: Opus encode error {}", opus_strerror(size)); + } + } } -void Voice::mix(int16_t* buffer, size_t frames, unsigned channels) { +void Voice::mix(int16_t* buffer, size_t samples, unsigned channels) { + static std::vector<int16_t> finalMixBuffer{}; + static std::vector<int32_t> voiceMixBuffer{}; + finalMixBuffer.resize(samples); + voiceMixBuffer.resize(samples); + int32_t* mixBuf = (int32_t*)memset(voiceMixBuffer.data(), 0, samples * sizeof(int32_t)); + //read into buffer now + bool mix = false; + { + MutexLocker lock(m_activeSpeakersMutex); + auto it = m_activeSpeakers.begin(); + while (it != m_activeSpeakers.end()) { + SpeakerPtr const& speaker = *it; + VoiceAudioStream* audio = speaker->audioStream.get(); + MutexLocker audioLock(audio->mutex); + if (!audio->empty()) { + if (!speaker->muted) { + mix = true; + auto channelVolumes = speaker->channelVolumes.load(); + for (size_t i = 0; i != samples; ++i) + mixBuf[i] += (int32_t)(audio->getSample()) * channelVolumes[i % 2]; + } + else { + for (size_t i = 0; i != samples; ++i) + audio->getSample(); + } + ++it; + } + else { + speaker->playing = false; + it = m_activeSpeakers.erase(it); + } + } + } + if (mix) { + int16_t* finBuf = finalMixBuffer.data(); + + float vol = m_outputVolume; + for (size_t i = 0; i != samples; ++i) + finBuf[i] = (int16_t)std::clamp<int>(mixBuf[i] * vol, INT16_MIN, INT16_MAX); + SDL_MixAudioFormat((Uint8*)buffer, (Uint8*)finBuf, AUDIO_S16, samples * sizeof(int16_t), SDL_MIX_MAXVOLUME); + } } void Voice::update(PositionalAttenuationFunction positionalAttenuationFunction) { if (positionalAttenuationFunction) { for (auto& entry : m_speakers) { if (SpeakerPtr& speaker = entry.second) { - speaker->audioStream->channelVolumes = { + speaker->channelVolumes = { positionalAttenuationFunction(0, speaker->position, 1.0f), positionalAttenuationFunction(1, speaker->position, 1.0f) }; @@ -268,9 +344,8 @@ void Voice::update(PositionalAttenuationFunction positionalAttenuationFunction) } } - auto now = Time::monotonicTime(); - if (now > nextSaveTime) { - nextSaveTime = 0.0; + if (Time::monotonicMilliseconds() > m_nextSaveTime) { + m_nextSaveTime = 0; save(); } } @@ -285,6 +360,97 @@ void Voice::setDeviceName(Maybe<String> deviceName) { openDevice(); } +int Voice::send(DataStreamBuffer& out, size_t budget) { + out.setByteOrder(ByteOrder::LittleEndian); + out.write<uint16_t>(VOICE_VERSION); + MutexLocker captureLock(m_captureMutex); + + if (!m_encoder || m_capturedChunks.empty()) + return 0; + + std::vector<ByteArray> encodedChunks = move(m_encodedChunks); + size_t encodedChunksLength = m_encodedChunksLength; + m_encodedChunksLength = 0; + captureLock.unlock(); + + for (auto& chunk : encodedChunks) { + out.write<uint32_t>(chunk.size()); + out.writeBytes(chunk); + if ((budget -= min<size_t>(budget, chunk.size())) == 0) + break; + } + + m_lastSentTime = Time::monotonicMilliseconds(); + return 1; +} + +bool Voice::receive(SpeakerPtr speaker, std::string_view view) { + if (!speaker || view.empty()) + return false; + + try { + DataStreamExternalBuffer reader(view.data(), view.size()); + reader.setByteOrder(ByteOrder::LittleEndian); + + if (reader.read<uint16_t>() > VOICE_VERSION) + return false; + + uint32_t opusLength = 0; + while (!reader.atEnd()) { + reader >> opusLength; + auto opusData = (unsigned char*)reader.ptr() + reader.pos(); + reader.seek(opusLength, IOSeek::Relative); + + int channels = opus_packet_get_nb_channels(opusData); + if (channels == OPUS_INVALID_PACKET) + continue; + + bool mono = channels == 1; + OpusDecoder* decoder = mono ? speaker->decoderMono.get() : speaker->decoderStereo.get(); + int samples = opus_decoder_get_nb_samples(decoder, opusData, opusLength); + if (samples < 0) + throw VoiceException(strf("Decoder error: {}", opus_strerror(samples)), false); + + size_t decodeBufferSize = samples * sizeof(opus_int16) * (size_t)channels; + opus_int16* decodeBuffer = (opus_int16*)malloc(decodeBufferSize); + + int decodedSamples = opus_decode(decoder, opusData, opusLength, decodeBuffer, decodeBufferSize, 0); + if (decodedSamples < 0) { + free(decodeBuffer); + throw VoiceException(strf("Decoder error: {}", opus_strerror(samples)), false); + } + + static auto getCVT = [](int channels) -> SDL_AudioCVT { + SDL_AudioCVT cvt; + SDL_BuildAudioCVT(&cvt, AUDIO_S16SYS, channels, VOICE_SAMPLE_RATE, AUDIO_S16, 2, 44100); + return cvt; + }; + + //TODO: This isn't the best way to resample to 44100 hz because SDL_ConvertAudio is not for streamed audio. + static SDL_AudioCVT monoCVT = getCVT(1); + static SDL_AudioCVT stereoCVT = getCVT(2); + SDL_AudioCVT& cvt = mono ? monoCVT : stereoCVT; + cvt.len = decodedSamples * sizeof(opus_int16) * (size_t)channels; + cvt.buf = (Uint8*)realloc(decodeBuffer, (size_t)(cvt.len * cvt.len_mult)); + SDL_ConvertAudio(&cvt); + + size_t reSamples = (size_t)cvt.len_cvt / 2; + speaker->decibelLevel = getAudioLoudness((int16_t*)cvt.buf, reSamples); + speaker->audioStream->take((opus_int16*)realloc(cvt.buf, cvt.len_cvt), reSamples); + playSpeaker(speaker, channels); + } + return true; + } + catch (StarException const& e) { + Logger::error("Voice: Error receiving voice data for speaker #{} ('{}'): {}", speaker->speakerId, speaker->name, e.what()); + return false; + } +} + +void Voice::setInput(bool input) { + m_lastInputTime = input ? Time::monotonicMilliseconds() + 1000 : 0; +} + OpusDecoder* Voice::createDecoder(int channels) { int error; OpusDecoder* decoder = opus_decoder_create(VOICE_SAMPLE_RATE, channels, &error); @@ -312,9 +478,17 @@ void Voice::resetEncoder() { void Voice::openDevice() { closeDevice(); - m_applicationController->openAudioInputDevice(m_deviceName ? m_deviceName->utf8Ptr() : nullptr, VOICE_SAMPLE_RATE, encoderChannels(), this, [](void* userdata, uint8_t* stream, int len) { - ((Voice*)(userdata))->getAudioData(stream, len); - }); + + + m_applicationController->openAudioInputDevice( + m_deviceName ? m_deviceName->utf8Ptr() : nullptr, + VOICE_SAMPLE_RATE, + m_deviceChannels = encoderChannels(), + this, + [](void* userdata, uint8_t* stream, int len) { + ((Voice*)(userdata))->readAudioData(stream, len); + } + ); m_deviceOpen = true; } @@ -328,4 +502,15 @@ void Voice::closeDevice() { m_deviceOpen = false; } +bool Voice::playSpeaker(SpeakerPtr const& speaker, int channels) { + unsigned int minSamples = speaker->minimumPlaySamples * channels; + if (speaker->playing || speaker->audioStream->samples < minSamples) + return false; + + speaker->playing = true; + MutexLocker lock(m_activeSpeakersMutex); + m_activeSpeakers.insert(speaker); + return true; +} + }
\ No newline at end of file diff --git a/source/frontend/StarVoice.hpp b/source/frontend/StarVoice.hpp index 269adb4..e7ecd80 100644 --- a/source/frontend/StarVoice.hpp +++ b/source/frontend/StarVoice.hpp @@ -6,8 +6,11 @@ #include "StarGameTypes.hpp" #include "StarMaybe.hpp" #include "StarThread.hpp" +#include "StarDataStreamDevices.hpp" #include "StarApplicationController.hpp" +#include <queue> + struct OpusDecoder; typedef std::unique_ptr<OpusDecoder, void(*)(OpusDecoder*)> OpusDecoderPtr; struct OpusEncoder; @@ -27,6 +30,36 @@ STAR_CLASS(Voice); STAR_CLASS(VoiceAudioStream); STAR_CLASS(ApplicationController); +struct VoiceAudioChunk { + std::unique_ptr<int16_t[]> data; + size_t remaining; + size_t offset = 0; + + VoiceAudioChunk(int16_t* ptr, size_t size) { + data.reset(ptr); + remaining = size; + offset = 0; + } + + inline size_t takeSamples(std::vector<int16_t>& out, size_t count) { + size_t toRead = std::min<size_t>(count, remaining); + int16_t* start = data.get() + offset; + out.insert(out.end(), start, start + toRead); + offset += toRead; + remaining -= toRead; + return toRead; + } + + //this one's unsafe + inline int16_t takeSample() { + --remaining; + return *(data.get() + offset++); + } + + inline bool exhausted() { return remaining == 0; } +}; + + class Voice { public: // Individual speakers are represented by their connection ID. @@ -45,6 +78,13 @@ public: VoiceAudioStreamPtr audioStream; Mutex mutex; + atomic<bool> muted = false; + atomic<bool> playing = false; + atomic<float> decibelLevel = 0.0f; + atomic<Array<float, 2>> channelVolumes = Array<float, 2>::filled(1.0f); + + unsigned int minimumPlaySamples = 4096; + Speaker(SpeakerId speakerId); }; @@ -77,7 +117,7 @@ public: SpeakerPtr speaker(SpeakerId speakerId); // Called when receiving input audio data from SDL, on its own thread. - void getAudioData(uint8_t* stream, int len); + void readAudioData(uint8_t* stream, int len); // Called to mix voice audio with the game. void mix(int16_t* buffer, size_t frames, unsigned channels); @@ -87,6 +127,12 @@ public: void setDeviceName(Maybe<String> device); + int send(DataStreamBuffer& out, size_t budget); + bool receive(SpeakerPtr speaker, std::string_view view); + + // Must be called every frame with input state, expires after 1s. + void setInput(bool input = true); + inline int encoderChannels() const { return m_channelMode == VoiceChannelMode::Mono ? 1 : 2; } @@ -99,10 +145,13 @@ private: void openDevice(); void closeDevice(); + bool playSpeaker(SpeakerPtr const& speaker, int channels); + SpeakerId m_speakerId = 0; SpeakerPtr m_clientSpeaker; HashMap<SpeakerId, SpeakerPtr> m_speakers; + Mutex m_activeSpeakersMutex; HashSet<SpeakerPtr> m_activeSpeakers; OpusEncoderPtr m_encoder; @@ -110,10 +159,15 @@ private: float m_outputVolume = 1.0f; float m_inputVolume = 1.0f; float m_threshold = -50.0f; - + + int64_t m_lastSentTime = 0; + int64_t m_lastInputTime = 0; + int64_t m_lastThresholdTime = 0; + int64_t m_nextSaveTime = 0; bool m_enabled = true; bool m_inputEnabled = true; + int m_deviceChannels = 1; bool m_deviceOpen = false; Maybe<String> m_deviceName; VoiceInputMode m_inputMode; @@ -121,7 +175,23 @@ private: ApplicationControllerPtr m_applicationController; - double nextSaveTime = 0.0f; + struct EncodedChunk { + std::unique_ptr<unsigned char[]> data; + size_t size; + + EncodedChunk(unsigned char* _data, size_t len) { + data.reset(_data); + size = len; + } + }; + + std::vector<ByteArray> m_encodedChunks; + size_t m_encodedChunksLength = 0; + + std::queue<VoiceAudioChunk> m_capturedChunks; + size_t m_capturedChunksFrames = 0; + + Mutex m_captureMutex; }; } |