51 lines
1.8 KiB
Python
51 lines
1.8 KiB
Python
# Speech to Audio2Face module utilizing the gRPC protocol from audio2face_streaming_utils
|
|
import io
|
|
from pydub import AudioSegment
|
|
from scipy.io.wavfile import read
|
|
import numpy as np
|
|
from audio2face_streaming_utils import push_audio_track
|
|
|
|
|
|
class Audio2FaceService:
    """Turn raw TTS audio bytes into float32 samples and push them to Audio2Face.

    The pipeline is: raw 16-bit mono PCM bytes -> WAV (via pydub) -> integer
    sample array (via scipy) -> float32 array scaled by 1/32768 ->
    ``push_audio_track``.
    """

    def __init__(self, sample_rate=44100):
        """
        :param sample_rate: sample rate handed to ``push_audio_track`` together
            with the audio samples
        """
        self.a2f_url = 'localhost:50051'  # Set it to the port of your local host
        # Use the caller's value; the default keeps the previous 44100 behavior.
        self.sample_rate = sample_rate
        self.avatar_instance = '/World/audio2face/PlayerStreaming'  # Set it to the name of your Audio2Face Streaming Instance

    def tts_to_wav(self, tts_byte: bytes, framerate: int = 22050) -> np.ndarray:
        """
        Decode raw TTS bytes into an array of WAV samples.

        :param tts_byte: raw TTS audio bytes, interpreted as 16-bit mono PCM
        :param framerate: frame rate of ``tts_byte`` in Hz
        :return: sample array as returned by ``scipy.io.wavfile.read``
        """
        seg = AudioSegment.from_raw(
            io.BytesIO(tts_byte),
            sample_width=2,
            frame_rate=framerate,
            channels=1,
        )
        wav_io = io.BytesIO()
        seg.export(wav_io, format="wav")
        # Re-read from a fresh buffer so the read position starts at the WAV header.
        _rate, samples = read(io.BytesIO(wav_io.getvalue()))
        return samples

    def wav_to_numpy_float32(self, wav_byte: np.ndarray) -> np.ndarray:
        """
        Convert integer WAV samples to float32.

        :param wav_byte: sample array (despite the name, an array rather than
            bytes), e.g. the result of :meth:`tts_to_wav`
        :return: C-ordered float32 array; each sample is divided by 32768.0,
            which maps 16-bit samples into [-1.0, 1.0)
        """
        return wav_byte.astype(np.float32, order='C') / 32768.0

    def get_tts_numpy_audio(self, audio: bytes) -> np.ndarray:
        """
        Decode raw TTS bytes and return them as float32 samples.

        :param audio: raw TTS audio bytes, decoded with the default frame rate
            of :meth:`tts_to_wav`
        :return: float32 sample array of the audio
        """
        samples = self.tts_to_wav(audio)
        return self.wav_to_numpy_float32(samples)

    def make_avatar_speaks(self, audio: bytes) -> None:
        """
        Convert TTS audio and push it to the configured Audio2Face instance.

        :param audio: raw TTS audio bytes
        :return: None
        """
        # NOTE(review): the audio is decoded at tts_to_wav's default 22050 Hz but
        # pushed with self.sample_rate (44100 by default) - confirm which rate
        # push_audio_track expects for these samples.
        push_audio_track(
            self.a2f_url,
            self.get_tts_numpy_audio(audio),
            self.sample_rate,
            self.avatar_instance,
        )
|