From 3633aa99e5d04769e4ff2c405c17693748e816d8 Mon Sep 17 00:00:00 2001 From: George Kasparyants Date: Wed, 24 Apr 2024 06:57:30 +0400 Subject: [PATCH] initial commit --- .gitignore | 8 + a2f_api.py | 216 ++++++++++++ audio2face.py | 50 +++ audio2face_pb2.py | 502 ++++++++++++++++++++++++++++ audio2face_pb2_grpc.py | 122 +++++++ audio2face_streaming_utils.py | 142 ++++++++ emotalk_own/Dockerfile | 54 +++ emotalk_own/LICENSE | 13 + emotalk_own/blender.sh | 4 + emotalk_own/demo.py | 111 ++++++ emotalk_own/model.py | 144 ++++++++ emotalk_own/readme.md | 103 ++++++ emotalk_own/render.py | 87 +++++ emotalk_own/requirements.txt | 5 + emotalk_own/utils.py | 39 +++ emotalk_own/wav2vec.py | 245 ++++++++++++++ main.py | 4 + miapia_own/FemAdv_b350_V2_050523.py | 164 +++++++++ miapia_own/__init__.py | 0 miapia_own/a.py | 57 ++++ miapia_own/aihandler.py | 36 ++ miapia_own/main.py | 243 ++++++++++++++ miapia_own/pieinfer.py | 153 +++++++++ miapia_stream/__init__.py | 0 server/connect.sh | 1 + server/sync_code.sh | 1 + server/sync_code_mia.sh | 1 + t2a_api.py | 12 + test_a2f_api.py | 38 +++ 29 files changed, 2555 insertions(+) create mode 100644 .gitignore create mode 100644 a2f_api.py create mode 100644 audio2face.py create mode 100644 audio2face_pb2.py create mode 100644 audio2face_pb2_grpc.py create mode 100644 audio2face_streaming_utils.py create mode 100644 emotalk_own/Dockerfile create mode 100644 emotalk_own/LICENSE create mode 100755 emotalk_own/blender.sh create mode 100644 emotalk_own/demo.py create mode 100644 emotalk_own/model.py create mode 100644 emotalk_own/readme.md create mode 100644 emotalk_own/render.py create mode 100644 emotalk_own/requirements.txt create mode 100644 emotalk_own/utils.py create mode 100755 emotalk_own/wav2vec.py create mode 100644 main.py create mode 100644 miapia_own/FemAdv_b350_V2_050523.py create mode 100644 miapia_own/__init__.py create mode 100644 miapia_own/a.py create mode 100644 miapia_own/aihandler.py create mode 100644 miapia_own/main.py create mode 100644 miapia_own/pieinfer.py create mode 100644 miapia_stream/__init__.py create mode 100755 server/connect.sh create mode 100755 server/sync_code.sh create mode 100755 server/sync_code_mia.sh create mode 100644 t2a_api.py create mode 100644 test_a2f_api.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1e8a5fb --- /dev/null +++ b/.gitignore @@ -0,0 +1,8 @@ +*.blend +*.wav +**__pycache__** +.* +*.png +*.jpg +*.jpeg +*.mp4 diff --git a/a2f_api.py b/a2f_api.py new file mode 100644 index 0000000..453d951 --- /dev/null +++ b/a2f_api.py @@ -0,0 +1,216 @@ +import math +import os +import requests +from pprint import pprint +import argparse +import soundfile + + +class A2F(object): + ROOT_PATH = "/home/ubuntu/results" + + BASE_PATH = os.path.expanduser("~/.local/share/ov/pkg/audio2face-2023.2.0/") + ASSETS_PATH = os.path.join(BASE_PATH, "exts/omni.audio2face.tool/deps/audio2face-assets") + CLAIRE_PATH = os.path.join(ASSETS_PATH, "claire/mesh/claire_fullface_model.usd") + MARK_PATH = os.path.join(ASSETS_PATH, "mark/mesh/mark_fullface_model.usd") + + PLAYER_NAME = "/World/audio2face/Player" + FULLFACE_MODEL_NAME = "/World/audio2face/CoreFullface" + + def __init__(self, url="http://localhost:8011/"): + self.url = url + + def status(self): + resp = requests.get(f"{self.url}status") + return resp.json() + + def get_emotion_names(self): + resp = requests.get(f"{self.url}A2F/A2E/GetEmotionNames") + return resp.json().get('result', []) + + def get_scene_objects(self): + resp = 
requests.get(f"{self.url}A2F/GetInstances") + return resp.json().get('result', {}) + + def get_players(self): + resp = requests.get(f"{self.url}A2F/Player/GetInstances") + return resp.json().get('result', {}) + + def load_usd(self, usd_path): + resp = requests.post(f"{self.url}A2F/USD/Load", + json={ + "file_name": usd_path, + }) + return resp.json() + + def load_claire(self): + print("Claire path: ", self.CLAIRE_PATH) + return self.load_usd(self.CLAIRE_PATH) + + def load_mark(self): + print("Mark path: ", self.MARK_PATH) + return self.load_usd(self.MARK_PATH) + + def openapi(self): + resp = requests.get(f"{self.url}openapi.json") + return resp.json() + + def get_frame(self): + pass + + def get_settings(self): + resp = requests.post(f"{self.url}A2F/GetSettings", json={ + "a2f_instance": "", + }) + return resp.json() + + def get_player_root(self): + resp = requests.post(f"{self.url}A2F/Player/GetRootPath", json={ + "a2f_player": self.PLAYER_NAME, + }) + return resp.json() + + def set_player_root(self, new_path): + resp = requests.post(f"{self.url}A2F/Player/SetRootPath", json={ + "a2f_player": self.PLAYER_NAME, + "dir_path": new_path, + }) + return resp.json() + + def set_audio(self, audio_path): + duration = soundfile.info(audio_path).duration + print("Audio duration: ", duration) + resp = requests.post(f"{self.url}A2F/Player/SetTrack", json={ + "a2f_player": self.PLAYER_NAME, + "file_name": audio_path, + "time_range": [ + 0, + duration + ] + }) + data = [resp.json()] + resp = requests.post(f"{self.url}A2F/Player/SetRange", json={ + "a2f_player": self.PLAYER_NAME, + "start": 0, + "end": duration + }) + data.append(resp.json()) + + resp = requests.post(f"{self.url}A2F/Player/GetTracks", json={ + "a2f_player": self.PLAYER_NAME, + }) + data.append(resp.json()) + + resp = requests.post(f"{self.url}A2F/Player/GetCurrentTrack", json={ + "a2f_player": self.PLAYER_NAME, + }) + data.append(resp.json()) + return data + + def get_audio_range(self): + resp = requests.post(f"{self.url}A2F/Player/GetRange", json={ + "a2f_player": self.PLAYER_NAME, + }) + return resp.json() + + def run(self): + resp = requests.post(f"{self.url}A2F/A2E/GenerateKeys", json={ + "a2f_instance": self.FULLFACE_MODEL_NAME, + }) + return resp.json() + + def get_number_of_keys(self): + resp = requests.post(f"{self.url}A2F/A2E/NumKeys", json={ + "a2f_instance": self.FULLFACE_MODEL_NAME, + }) + return resp.json() + + def get_generated_keys(self): + resp = requests.post(f"{self.url}A2F/A2E/GetKeyData", json={ + "a2f_instance": self.FULLFACE_MODEL_NAME, + }) + return resp.json() + + def get_a2e_settings(self): + resp = requests.post(f"{self.url}A2F/A2E/GetSettings", json={ + "a2f_instance": self.FULLFACE_MODEL_NAME, + }) + return resp.json() + + def get_blendshape_solvers(self): + resp = requests.get(f"{self.url}A2F/Exporter/GetBlendShapeSolvers") + return resp.json() + + def get_pre_settings(self): + resp = requests.post(f"{self.url}A2F/PRE/SetSettings", json={ + "a2f_instance": self.FULLFACE_MODEL_NAME, + "prediction_delay": 0.01, + }) + resp = requests.post(f"{self.url}A2F/PRE/GetSettings", json={ + "a2f_instance": self.FULLFACE_MODEL_NAME, + }) + return resp.json() + + def get_post_settings(self): + resp = requests.post(f"{self.url}A2F/POST/GetSettings", json={ + "a2f_instance": self.FULLFACE_MODEL_NAME, + }) + return resp.json() + + def export(self, export_path, filename): + resp = requests.post(f"{self.url}A2F/Exporter/ExportGeometryCache", json={ + "export_directory": export_path, + "cache_type": "usd", + "xform_keys": False, 
+ "batch": False, + "file_name": filename, + "fps": 0 + }) + try: + return resp.json() + except: + print(resp.content) + + def export_json(self, export_path, filename): + resp = requests.post(f"{self.url}A2F/Exporter/ExportBlendshapes", json={ + "export_directory": export_path, + "format": "json", + "batch": False, + "file_name": filename, + "fps": 0 + }) + try: + return resp.json() + except: + print(resp.content) + + def upload(self, audio_path): + audio_path = os.path.abspath(audio_path) + fname = os.path.basename(audio_path) + os.system(f"cd ./server && ./send_file_to_gui.sh {audio_path} ../results/{fname}") + return os.path.join("/home/ubuntu/results", fname) + + def pull(self, fname): + export_fname = os.path.splitext(fname)[0] + '.usd' + os.system(f"cd ./server && ./send_file_from_gui.sh ../results/{export_fname} ../") + + def apply(self, audio_path): + fname = os.path.basename(audio_path) + print("Status: ", self.status()) + print("EmotionNames: ", self.get_emotion_names()) + print("Scene Objects: ", self.get_scene_objects()) + print("Scene Players: ", self.get_players()) + print("Preprocessing settings: ", self.get_pre_settings()) + print("Postprocessing settings: ", self.get_post_settings()) + print("Setting player root: ", self.set_player_root("/home/ubuntu/results")) + print("Player root: ", self.get_player_root()) + print("Setting audio: ", self.set_audio(os.path.basename(audio_path))) + print("Audio Range: ", self.get_audio_range()) + print("Running: ", self.run()) + print("NumKeys: ", self.get_number_of_keys()) + print("Keys: ", self.get_generated_keys()) + print("Exporting: ", self.export_json("/home/ubuntu/results", + filename=os.path.splitext(fname)[0])) + + def apply_stream(self, audio_path): + pass diff --git a/audio2face.py b/audio2face.py new file mode 100644 index 0000000..91e82e7 --- /dev/null +++ b/audio2face.py @@ -0,0 +1,50 @@ +# speech to Audio2Face module utilizing the gRPC protocal from audio2face_streaming_utils +import io +from pydub import AudioSegment +from scipy.io.wavfile import read +import numpy as np +from audio2face_streaming_utils import push_audio_track + + +class Audio2FaceService: + def __init__(self, sample_rate=44100): + """ + :param sample_rate: sample rate + """ + self.a2f_url = 'localhost:50051' # Set it to the port of your local host + self.sample_rate = 44100 + self.avatar_instance = '/World/audio2face/PlayerStreaming' # Set it to the name of your Audio2Face Streaming Instance + + def tts_to_wav(self, tts_byte, framerate=22050) -> str: + """ + :param tts_byte: tts data in byte + :param framerate: framerate + :return: wav byte + """ + seg = AudioSegment.from_raw(io.BytesIO(tts_byte), sample_width=2, frame_rate=22050, channels=1) + wavIO = io.BytesIO() + seg.export(wavIO, format="wav") + rate, wav = read(io.BytesIO(wavIO.getvalue())) + return wav + + def wav_to_numpy_float32(self, wav_byte) -> float: + """ + :param wav_byte: wav byte + :return: float32 + """ + return wav_byte.astype(np.float32, order='C') / 32768.0 + + def get_tts_numpy_audio(self, audio) -> float: + """ + :param audio: audio from tts_to_wav + :return: float32 of the audio + """ + wav_byte = self.tts_to_wav(audio) + return self.wav_to_numpy_float32(wav_byte) + + def make_avatar_speaks(self, audio) -> None: + """ + :param audio: tts audio + :return: None + """ + push_audio_track(self.a2f_url, self.get_tts_numpy_audio(audio), self.sample_rate, self.avatar_instance) diff --git a/audio2face_pb2.py b/audio2face_pb2.py new file mode 100644 index 0000000..9d14840 --- /dev/null +++ 
b/audio2face_pb2.py @@ -0,0 +1,502 @@ +# -*- coding: utf-8 -*- +# Generated by the protocol buffer compiler. DO NOT EDIT! +# source: audio2face.proto +"""Generated protocol buffer code.""" +from google.protobuf import descriptor as _descriptor +from google.protobuf import message as _message +from google.protobuf import reflection as _reflection +from google.protobuf import symbol_database as _symbol_database + +# @@protoc_insertion_point(imports) + +_sym_db = _symbol_database.Default() + + +DESCRIPTOR = _descriptor.FileDescriptor( + name="audio2face.proto", + package="nvidia.audio2face", + syntax="proto3", + serialized_options=None, + create_key=_descriptor._internal_create_key, + serialized_pb=b'\n\x10\x61udio2face.proto\x12\x11nvidia.audio2face"{\n\x10PushAudioRequest\x12\x15\n\rinstance_name\x18\x01 \x01(\t\x12\x12\n\nsamplerate\x18\x02 \x01(\x05\x12\x12\n\naudio_data\x18\x03 \x01(\x0c\x12(\n block_until_playback_is_finished\x18\x04 \x01(\x08"5\n\x11PushAudioResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"\x85\x01\n\x16PushAudioStreamRequest\x12@\n\x0cstart_marker\x18\x01 \x01(\x0b\x32(.nvidia.audio2face.PushAudioRequestStartH\x00\x12\x14\n\naudio_data\x18\x02 \x01(\x0cH\x00\x42\x13\n\x11streaming_request"l\n\x15PushAudioRequestStart\x12\x15\n\rinstance_name\x18\x01 \x01(\t\x12\x12\n\nsamplerate\x18\x02 \x01(\x05\x12(\n block_until_playback_is_finished\x18\x03 \x01(\x08";\n\x17PushAudioStreamResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t2\xd4\x01\n\nAudio2Face\x12X\n\tPushAudio\x12#.nvidia.audio2face.PushAudioRequest\x1a$.nvidia.audio2face.PushAudioResponse"\x00\x12l\n\x0fPushAudioStream\x12).nvidia.audio2face.PushAudioStreamRequest\x1a*.nvidia.audio2face.PushAudioStreamResponse"\x00(\x01\x62\x06proto3', +) + + +_PUSHAUDIOREQUEST = _descriptor.Descriptor( + name="PushAudioRequest", + full_name="nvidia.audio2face.PushAudioRequest", + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="instance_name", + full_name="nvidia.audio2face.PushAudioRequest.instance_name", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="samplerate", + full_name="nvidia.audio2face.PushAudioRequest.samplerate", + index=1, + number=2, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="audio_data", + full_name="nvidia.audio2face.PushAudioRequest.audio_data", + index=2, + number=3, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"", + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="block_until_playback_is_finished", + full_name="nvidia.audio2face.PushAudioRequest.block_until_playback_is_finished", + index=3, + number=4, + type=8, + 
cpp_type=7, + label=1, + has_default_value=False, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto3", + extension_ranges=[], + oneofs=[], + serialized_start=39, + serialized_end=162, +) + + +_PUSHAUDIORESPONSE = _descriptor.Descriptor( + name="PushAudioResponse", + full_name="nvidia.audio2face.PushAudioResponse", + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="success", + full_name="nvidia.audio2face.PushAudioResponse.success", + index=0, + number=1, + type=8, + cpp_type=7, + label=1, + has_default_value=False, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="message", + full_name="nvidia.audio2face.PushAudioResponse.message", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto3", + extension_ranges=[], + oneofs=[], + serialized_start=164, + serialized_end=217, +) + + +_PUSHAUDIOSTREAMREQUEST = _descriptor.Descriptor( + name="PushAudioStreamRequest", + full_name="nvidia.audio2face.PushAudioStreamRequest", + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="start_marker", + full_name="nvidia.audio2face.PushAudioStreamRequest.start_marker", + index=0, + number=1, + type=11, + cpp_type=10, + label=1, + has_default_value=False, + default_value=None, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="audio_data", + full_name="nvidia.audio2face.PushAudioStreamRequest.audio_data", + index=1, + number=2, + type=12, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"", + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto3", + extension_ranges=[], + oneofs=[ + _descriptor.OneofDescriptor( + name="streaming_request", + full_name="nvidia.audio2face.PushAudioStreamRequest.streaming_request", + index=0, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[], + ) + ], + serialized_start=220, + serialized_end=353, +) + + +_PUSHAUDIOREQUESTSTART = _descriptor.Descriptor( + name="PushAudioRequestStart", + full_name="nvidia.audio2face.PushAudioRequestStart", + filename=None, + 
file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="instance_name", + full_name="nvidia.audio2face.PushAudioRequestStart.instance_name", + index=0, + number=1, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="samplerate", + full_name="nvidia.audio2face.PushAudioRequestStart.samplerate", + index=1, + number=2, + type=5, + cpp_type=1, + label=1, + has_default_value=False, + default_value=0, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="block_until_playback_is_finished", + full_name="nvidia.audio2face.PushAudioRequestStart.block_until_playback_is_finished", + index=2, + number=3, + type=8, + cpp_type=7, + label=1, + has_default_value=False, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto3", + extension_ranges=[], + oneofs=[], + serialized_start=355, + serialized_end=463, +) + + +_PUSHAUDIOSTREAMRESPONSE = _descriptor.Descriptor( + name="PushAudioStreamResponse", + full_name="nvidia.audio2face.PushAudioStreamResponse", + filename=None, + file=DESCRIPTOR, + containing_type=None, + create_key=_descriptor._internal_create_key, + fields=[ + _descriptor.FieldDescriptor( + name="success", + full_name="nvidia.audio2face.PushAudioStreamResponse.success", + index=0, + number=1, + type=8, + cpp_type=7, + label=1, + has_default_value=False, + default_value=False, + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + _descriptor.FieldDescriptor( + name="message", + full_name="nvidia.audio2face.PushAudioStreamResponse.message", + index=1, + number=2, + type=9, + cpp_type=9, + label=1, + has_default_value=False, + default_value=b"".decode("utf-8"), + message_type=None, + enum_type=None, + containing_type=None, + is_extension=False, + extension_scope=None, + serialized_options=None, + file=DESCRIPTOR, + create_key=_descriptor._internal_create_key, + ), + ], + extensions=[], + nested_types=[], + enum_types=[], + serialized_options=None, + is_extendable=False, + syntax="proto3", + extension_ranges=[], + oneofs=[], + serialized_start=465, + serialized_end=524, +) + +_PUSHAUDIOSTREAMREQUEST.fields_by_name["start_marker"].message_type = _PUSHAUDIOREQUESTSTART +_PUSHAUDIOSTREAMREQUEST.oneofs_by_name["streaming_request"].fields.append( + _PUSHAUDIOSTREAMREQUEST.fields_by_name["start_marker"] +) +_PUSHAUDIOSTREAMREQUEST.fields_by_name["start_marker"].containing_oneof = _PUSHAUDIOSTREAMREQUEST.oneofs_by_name[ + "streaming_request" +] +_PUSHAUDIOSTREAMREQUEST.oneofs_by_name["streaming_request"].fields.append( + _PUSHAUDIOSTREAMREQUEST.fields_by_name["audio_data"] +) 
+_PUSHAUDIOSTREAMREQUEST.fields_by_name["audio_data"].containing_oneof = _PUSHAUDIOSTREAMREQUEST.oneofs_by_name[ + "streaming_request" +] +DESCRIPTOR.message_types_by_name["PushAudioRequest"] = _PUSHAUDIOREQUEST +DESCRIPTOR.message_types_by_name["PushAudioResponse"] = _PUSHAUDIORESPONSE +DESCRIPTOR.message_types_by_name["PushAudioStreamRequest"] = _PUSHAUDIOSTREAMREQUEST +DESCRIPTOR.message_types_by_name["PushAudioRequestStart"] = _PUSHAUDIOREQUESTSTART +DESCRIPTOR.message_types_by_name["PushAudioStreamResponse"] = _PUSHAUDIOSTREAMRESPONSE +_sym_db.RegisterFileDescriptor(DESCRIPTOR) + +PushAudioRequest = _reflection.GeneratedProtocolMessageType( + "PushAudioRequest", + (_message.Message,), + { + "DESCRIPTOR": _PUSHAUDIOREQUEST, + "__module__": "audio2face_pb2" + # @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioRequest) + }, +) +_sym_db.RegisterMessage(PushAudioRequest) + +PushAudioResponse = _reflection.GeneratedProtocolMessageType( + "PushAudioResponse", + (_message.Message,), + { + "DESCRIPTOR": _PUSHAUDIORESPONSE, + "__module__": "audio2face_pb2" + # @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioResponse) + }, +) +_sym_db.RegisterMessage(PushAudioResponse) + +PushAudioStreamRequest = _reflection.GeneratedProtocolMessageType( + "PushAudioStreamRequest", + (_message.Message,), + { + "DESCRIPTOR": _PUSHAUDIOSTREAMREQUEST, + "__module__": "audio2face_pb2" + # @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioStreamRequest) + }, +) +_sym_db.RegisterMessage(PushAudioStreamRequest) + +PushAudioRequestStart = _reflection.GeneratedProtocolMessageType( + "PushAudioRequestStart", + (_message.Message,), + { + "DESCRIPTOR": _PUSHAUDIOREQUESTSTART, + "__module__": "audio2face_pb2" + # @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioRequestStart) + }, +) +_sym_db.RegisterMessage(PushAudioRequestStart) + +PushAudioStreamResponse = _reflection.GeneratedProtocolMessageType( + "PushAudioStreamResponse", + (_message.Message,), + { + "DESCRIPTOR": _PUSHAUDIOSTREAMRESPONSE, + "__module__": "audio2face_pb2" + # @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioStreamResponse) + }, +) +_sym_db.RegisterMessage(PushAudioStreamResponse) + + +_AUDIO2FACE = _descriptor.ServiceDescriptor( + name="Audio2Face", + full_name="nvidia.audio2face.Audio2Face", + file=DESCRIPTOR, + index=0, + serialized_options=None, + create_key=_descriptor._internal_create_key, + serialized_start=527, + serialized_end=739, + methods=[ + _descriptor.MethodDescriptor( + name="PushAudio", + full_name="nvidia.audio2face.Audio2Face.PushAudio", + index=0, + containing_service=None, + input_type=_PUSHAUDIOREQUEST, + output_type=_PUSHAUDIORESPONSE, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), + _descriptor.MethodDescriptor( + name="PushAudioStream", + full_name="nvidia.audio2face.Audio2Face.PushAudioStream", + index=1, + containing_service=None, + input_type=_PUSHAUDIOSTREAMREQUEST, + output_type=_PUSHAUDIOSTREAMRESPONSE, + serialized_options=None, + create_key=_descriptor._internal_create_key, + ), + ], +) +_sym_db.RegisterServiceDescriptor(_AUDIO2FACE) + +DESCRIPTOR.services_by_name["Audio2Face"] = _AUDIO2FACE + +# @@protoc_insertion_point(module_scope) diff --git a/audio2face_pb2_grpc.py b/audio2face_pb2_grpc.py new file mode 100644 index 0000000..9ed1975 --- /dev/null +++ b/audio2face_pb2_grpc.py @@ -0,0 +1,122 @@ +# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT! 
+"""Client and server classes corresponding to protobuf-defined services.""" +import grpc + +import audio2face_pb2 as audio2face__pb2 + + +class Audio2FaceStub(object): + """Missing associated documentation comment in .proto file.""" + + def __init__(self, channel): + """Constructor. + + Args: + channel: A grpc.Channel. + """ + self.PushAudio = channel.unary_unary( + "/nvidia.audio2face.Audio2Face/PushAudio", + request_serializer=audio2face__pb2.PushAudioRequest.SerializeToString, + response_deserializer=audio2face__pb2.PushAudioResponse.FromString, + ) + self.PushAudioStream = channel.stream_unary( + "/nvidia.audio2face.Audio2Face/PushAudioStream", + request_serializer=audio2face__pb2.PushAudioStreamRequest.SerializeToString, + response_deserializer=audio2face__pb2.PushAudioStreamResponse.FromString, + ) + + +class Audio2FaceServicer(object): + """Missing associated documentation comment in .proto file.""" + + def PushAudio(self, request, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + def PushAudioStream(self, request_iterator, context): + """Missing associated documentation comment in .proto file.""" + context.set_code(grpc.StatusCode.UNIMPLEMENTED) + context.set_details("Method not implemented!") + raise NotImplementedError("Method not implemented!") + + +def add_Audio2FaceServicer_to_server(servicer, server): + rpc_method_handlers = { + "PushAudio": grpc.unary_unary_rpc_method_handler( + servicer.PushAudio, + request_deserializer=audio2face__pb2.PushAudioRequest.FromString, + response_serializer=audio2face__pb2.PushAudioResponse.SerializeToString, + ), + "PushAudioStream": grpc.stream_unary_rpc_method_handler( + servicer.PushAudioStream, + request_deserializer=audio2face__pb2.PushAudioStreamRequest.FromString, + response_serializer=audio2face__pb2.PushAudioStreamResponse.SerializeToString, + ), + } + generic_handler = grpc.method_handlers_generic_handler("nvidia.audio2face.Audio2Face", rpc_method_handlers) + server.add_generic_rpc_handlers((generic_handler,)) + + +# This class is part of an EXPERIMENTAL API. 
+class Audio2Face(object): + """Missing associated documentation comment in .proto file.""" + + @staticmethod + def PushAudio( + request, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.unary_unary( + request, + target, + "/nvidia.audio2face.Audio2Face/PushAudio", + audio2face__pb2.PushAudioRequest.SerializeToString, + audio2face__pb2.PushAudioResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) + + @staticmethod + def PushAudioStream( + request_iterator, + target, + options=(), + channel_credentials=None, + call_credentials=None, + insecure=False, + compression=None, + wait_for_ready=None, + timeout=None, + metadata=None, + ): + return grpc.experimental.stream_unary( + request_iterator, + target, + "/nvidia.audio2face.Audio2Face/PushAudioStream", + audio2face__pb2.PushAudioStreamRequest.SerializeToString, + audio2face__pb2.PushAudioStreamResponse.FromString, + options, + channel_credentials, + insecure, + call_credentials, + compression, + wait_for_ready, + timeout, + metadata, + ) diff --git a/audio2face_streaming_utils.py b/audio2face_streaming_utils.py new file mode 100644 index 0000000..a65aba2 --- /dev/null +++ b/audio2face_streaming_utils.py @@ -0,0 +1,142 @@ +""" +This demo script shows how to send audio data to Audio2Face Streaming Audio Player via gRPC requests. +There are two options: + * Send the whole track at once using PushAudioRequest() + * Send the audio chunks seuqntially in a stream using PushAudioStreamRequest() +For the second option this script emulates the stream of chunks, generated by splitting an input WAV audio file. 
+But in a real application such stream of chunks may be aquired from some other streaming source: + * streaming audio via internet, streaming Text-To-Speech, etc +gRPC protocol details could be find in audio2face.proto +""" + +import sys +import grpc +import time +import numpy as np +import soundfile + +import audio2face_pb2 +import audio2face_pb2_grpc + + +def push_audio_track(url, audio_data, samplerate, instance_name): + """ + This function pushes the whole audio track at once via PushAudioRequest() + PushAudioRequest parameters: + * audio_data: bytes, containing audio data for the whole track, where each sample is encoded as 4 bytes (float32) + * samplerate: sampling rate for the audio data + * instance_name: prim path of the Audio2Face Streaming Audio Player on the stage, were to push the audio data + * block_until_playback_is_finished: if True, the gRPC request will be blocked until the playback of the pushed track is finished + The request is passed to PushAudio() + """ + + block_until_playback_is_finished = True # ADJUST + with grpc.insecure_channel(url) as channel: + stub = audio2face_pb2_grpc.Audio2FaceStub(channel) + request = audio2face_pb2.PushAudioRequest() + request.audio_data = audio_data.astype(np.float32).tobytes() + request.samplerate = samplerate + request.instance_name = instance_name + request.block_until_playback_is_finished = block_until_playback_is_finished + print("Sending audio data...") + response = stub.PushAudio(request) + if response.success: + print("SUCCESS") + else: + print(f"ERROR: {response.message}") + print("Closed channel") + + +def push_audio_track_stream(url, audio_data, samplerate, instance_name): + """ + This function pushes audio chunks sequentially via PushAudioStreamRequest() + The function emulates the stream of chunks, generated by splitting input audio track. + But in a real application such stream of chunks may be aquired from some other streaming source. 
+ The first message must contain start_marker field, containing only meta information (without audio data): + * samplerate: sampling rate for the audio data + * instance_name: prim path of the Audio2Face Streaming Audio Player on the stage, were to push the audio data + * block_until_playback_is_finished: if True, the gRPC request will be blocked until the playback of the pushed track is finished (after the last message) + Second and other messages must contain audio_data field: + * audio_data: bytes, containing audio data for an audio chunk, where each sample is encoded as 4 bytes (float32) + All messages are packed into a Python generator and passed to PushAudioStream() + """ + + chunk_size = samplerate // 10 # ADJUST + sleep_between_chunks = 0.04 # ADJUST + block_until_playback_is_finished = True # ADJUST + + with grpc.insecure_channel(url) as channel: + print("Channel creadted") + stub = audio2face_pb2_grpc.Audio2FaceStub(channel) + + def make_generator(): + start_marker = audio2face_pb2.PushAudioRequestStart( + samplerate=samplerate, + instance_name=instance_name, + block_until_playback_is_finished=block_until_playback_is_finished, + ) + # At first, we send a message with start_marker + yield audio2face_pb2.PushAudioStreamRequest(start_marker=start_marker) + # Then we send messages with audio_data + for i in range(len(audio_data) // chunk_size + 1): + time.sleep(sleep_between_chunks) + chunk = audio_data[i * chunk_size : i * chunk_size + chunk_size] + yield audio2face_pb2.PushAudioStreamRequest(audio_data=chunk.astype(np.float32).tobytes()) + + request_generator = make_generator() + print("Sending audio data...") + response = stub.PushAudioStream(request_generator) + if response.success: + print("SUCCESS") + else: + print(f"ERROR: {response.message}") + print("Channel closed") + + +def main(): + """ + This demo script shows how to send audio data to Audio2Face Streaming Audio Player via gRPC requests. + There two options: + * Send the whole track at once using PushAudioRequest() + * Send the audio chunks seuqntially in a stream using PushAudioStreamRequest() + For the second option this script emulates the stream of chunks, generated by splitting an input WAV audio file. 
+ But in a real application such stream of chunks may be aquired from some other streaming source: + * streaming audio via internet, streaming Text-To-Speech, etc + gRPC protocol details could be find in audio2face.proto + """ + + if len(sys.argv) < 3: + print("Format: python test_client.py PATH_TO_WAV INSTANCE_NAME") + return + + # Sleep time emulates long latency of the request + sleep_time = 2.0 # ADJUST + + # URL of the Audio2Face Streaming Audio Player server (where A2F App is running) + url = "localhost:50051" # ADJUST + + # Local input WAV file path + audio_fpath = sys.argv[1] + + # Prim path of the Audio2Face Streaming Audio Player on the stage (were to push the audio data) + instance_name = sys.argv[2] + + data, samplerate = soundfile.read(audio_fpath, dtype="float32") + + # Only Mono audio is supported + if len(data.shape) > 1: + data = np.average(data, axis=1) + + print(f"Sleeping for {sleep_time} seconds") + time.sleep(sleep_time) + + if 0: # ADJUST + # Push the whole audio track at once + push_audio_track(url, data, samplerate, instance_name) + else: + # Emulate audio stream and push audio chunks sequentially + push_audio_track_stream(url, data, samplerate, instance_name) + + +if __name__ == "__main__": + main() diff --git a/emotalk_own/Dockerfile b/emotalk_own/Dockerfile new file mode 100644 index 0000000..dedbee3 --- /dev/null +++ b/emotalk_own/Dockerfile @@ -0,0 +1,54 @@ +FROM nvidia/cudagl:11.3.1-devel-ubuntu20.04 +MAINTAINER "Jungwoo Choi" + +ARG DEBIAN_FRONTEND=noninteractive +ENV TZ=Asia/Seoul + +ADD requirements.txt /tmp/requirements.txt +RUN \ + # Fix CUDA apt error + rm -f /etc/apt/sources.list.d/cuda.list && \ + rm -f /etc/apt/sources.list.d/nvidia-ml.list && \ + apt-get update && apt-get install -y gnupg2 software-properties-common && \ + apt-key del 7fa2af80 && \ + apt-get update && apt-get install -y --no-install-recommends wget && \ + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \ + dpkg -i cuda-keyring_1.0-1_all.deb && \ + # Install Start + apt update && \ + add-apt-repository -y ppa:savoury1/ffmpeg4 && \ + apt -y install python3.8 python3.8-distutils libgl1-mesa-glx libglib2.0-0 git wget zsh vim openssh-server curl ffmpeg && \ + # Python Library + update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \ + wget https://bootstrap.pypa.io/get-pip.py && \ + python3 get-pip.py && \ + pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113 && \ + pip install -r /tmp/requirements.txt && \ + # zsh option + chsh -s /bin/zsh && \ + sh -c "$(wget -O- https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" && \ + # add zsh-autosuggestions, zsh-syntax-highlighting plugin + git clone https://github.com/zsh-users/zsh-autosuggestions ~/.oh-my-zsh/custom/plugins/zsh-autosuggestions && \ + git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ~/.oh-my-zsh/custom/plugins/zsh-syntax-highlighting && \ + # Modify .zshrc whth Perl + perl -pi -w -e 's/ZSH_THEME=.*/ZSH_THEME="af-magic"/g;' ~/.zshrc && \ + perl -pi -w -e 's/plugins=.*/plugins=(git ssh-agent zsh-autosuggestions zsh-syntax-highlighting)/g;' ~/.zshrc && \ + # Set ssh id and password, default is id = root, password = root. 
+ # I recommand changing this for more security + # PermitRootLogin : yes - for ssh connection + echo 'root:root' |chpasswd && \ + sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && \ + sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config && \ + mkdir /root/.ssh && \ + mkdir /var/run/sshd && \ + # install language pack for timeline issue. + apt-get install -y language-pack-en && update-locale && \ + # Clean up + apt-get clean && \ + apt-get autoclean && \ + apt-get autoremove -y && \ + rm -rf /var/lib/cache/* && \ + rm -rf /var/lib/log/* + +WORKDIR /workspace +CMD ["echo", "nvidia/cudagl:11.3.1-devel-ubuntu20.04 is ready!", 'zsh'] diff --git a/emotalk_own/LICENSE b/emotalk_own/LICENSE new file mode 100644 index 0000000..53339cf --- /dev/null +++ b/emotalk_own/LICENSE @@ -0,0 +1,13 @@ +Copyright (c) 2023 Psyche AI Inc. + +This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License (CC BY-NC 4.0). To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, and distribute the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: + +1. Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use. + +2. NonCommercial — You may not use the material for commercial purposes. + +3. No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
diff --git a/emotalk_own/blender.sh b/emotalk_own/blender.sh new file mode 100755 index 0000000..cbfe651 --- /dev/null +++ b/emotalk_own/blender.sh @@ -0,0 +1,4 @@ +wget https://ftp.nluug.nl/pub/graphics/blender/release/Blender3.4/blender-3.4.1-linux-x64.tar.xz +tar -xf blender-3.4.1-linux-x64.tar.xz +mv blender-3.4.1-linux-x64 blender && rm blender-3.4.1-linux-x64.tar.xz + diff --git a/emotalk_own/demo.py b/emotalk_own/demo.py new file mode 100644 index 0000000..35eee1b --- /dev/null +++ b/emotalk_own/demo.py @@ -0,0 +1,111 @@ +import librosa +import numpy as np +import argparse +from scipy.signal import savgol_filter +import torch +from model import EmoTalk +import random +import os, subprocess +import shlex + + +@torch.no_grad() +def test(args): + result_path = args.result_path + os.makedirs(result_path, exist_ok=True) + eye1 = np.array([0.36537236, 0.950235724, 0.95593375, 0.916715622, 0.367256105, 0.119113259, 0.025357503]) + eye2 = np.array([0.234776169, 0.909951985, 0.944758058, 0.777862132, 0.191071674, 0.235437036, 0.089163929]) + eye3 = np.array([0.870040774, 0.949833691, 0.949418545, 0.695911646, 0.191071674, 0.072576277, 0.007108896]) + eye4 = np.array([0.000307991, 0.556701422, 0.952656746, 0.942345619, 0.425857186, 0.148335218, 0.017659493]) + model = EmoTalk(args) + model.load_state_dict(torch.load(args.model_path, map_location=torch.device(args.device)), strict=False) + model = model.to(args.device) + model.eval() + wav_path = args.wav_path + file_name = wav_path.split('/')[-1].split('.')[0] + speech_array, sampling_rate = librosa.load(os.path.join(wav_path), sr=16000) + audio = torch.FloatTensor(speech_array).unsqueeze(0).to(args.device) + level = torch.tensor([1]).to(args.device) + person = torch.tensor([0]).to(args.device) + prediction = model.predict(audio, level, person) + prediction = prediction.squeeze().detach().cpu().numpy() + if args.post_processing: + output = np.zeros((prediction.shape[0], prediction.shape[1])) + for i in range(prediction.shape[1]): + output[:, i] = savgol_filter(prediction[:, i], 5, 2) + output[:, 8] = 0 + output[:, 9] = 0 + i = random.randint(0, 60) + while i < output.shape[0] - 7: + eye_num = random.randint(1, 4) + if eye_num == 1: + output[i:i + 7, 8] = eye1 + output[i:i + 7, 9] = eye1 + elif eye_num == 2: + output[i:i + 7, 8] = eye2 + output[i:i + 7, 9] = eye2 + elif eye_num == 3: + output[i:i + 7, 8] = eye3 + output[i:i + 7, 9] = eye3 + else: + output[i:i + 7, 8] = eye4 + output[i:i + 7, 9] = eye4 + time1 = random.randint(60, 180) + i = i + time1 + np.save(os.path.join(result_path, "{}.npy".format(file_name)), output) # with postprocessing (smoothing and blinking) + else: + np.save(os.path.join(result_path, "{}.npy".format(file_name)), prediction) # without post-processing + + +def render_video(args): + wav_name = args.wav_path.split('/')[-1].split('.')[0] + image_path = os.path.join(args.result_path, wav_name) + os.makedirs(image_path, exist_ok=True) + image_temp = image_path + "/%d.png" + output_path = os.path.join(args.result_path, wav_name + ".mp4") + blender_path = args.blender_path + python_path = "./render.py" + blend_path = "./render.blend" + cmd = '{} -t 64 -b {} -P {} -- "{}" "{}" '.format(blender_path, blend_path, python_path, args.result_path, wav_name) + cmd = shlex.split(cmd) + p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + while p.poll() is None: + line = p.stdout.readline() + line = line.strip() + if line: + print('[{}]'.format(line)) + if p.returncode == 0: + print('Subprogram 
success') + else: + print('Subprogram failed') + + cmd = 'ffmpeg -r 30 -i "{}" -i "{}" -pix_fmt yuv420p -s 512x768 "{}" -y'.format(image_temp, args.wav_path, output_path) + subprocess.call(cmd, shell=True) + + cmd = 'rm -rf "{}"'.format(image_path) + subprocess.call(cmd, shell=True) + + +def main(): + parser = argparse.ArgumentParser( + description='EmoTalk: Speech-driven Emotional Disentanglement for 3D Face Animation') + parser.add_argument("--wav_path", type=str, default="./audio/angry1.wav", help='path of the test data') + parser.add_argument("--bs_dim", type=int, default=52, help='number of blendshapes:52') + parser.add_argument("--feature_dim", type=int, default=832, help='number of feature dim') + parser.add_argument("--period", type=int, default=30, help='number of period') + parser.add_argument("--device", type=str, default="cuda", help='device') + parser.add_argument("--model_path", type=str, default="./pretrain_model/EmoTalk.pth", + help='path of the trained models') + parser.add_argument("--result_path", type=str, default="./result/", help='path of the result') + parser.add_argument("--max_seq_len", type=int, default=5000, help='max sequence length') + parser.add_argument("--num_workers", type=int, default=0) + parser.add_argument("--batch_size", type=int, default=1) + parser.add_argument("--post_processing", type=bool, default=True, help='whether to use post processing') + parser.add_argument("--blender_path", type=str, default="./blender/blender", help='path of blender') + args = parser.parse_args() + test(args) + render_video(args) + + +if __name__ == "__main__": + main() diff --git a/emotalk_own/model.py b/emotalk_own/model.py new file mode 100644 index 0000000..2aab764 --- /dev/null +++ b/emotalk_own/model.py @@ -0,0 +1,144 @@ +import torch +import torch.nn as nn +import numpy as np +import math +from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor +from wav2vec import Wav2Vec2Model, Wav2Vec2ForSpeechClassification +from utils import init_biased_mask, enc_dec_mask + + +class EmoTalk(nn.Module): + def __init__(self, args): + super(EmoTalk, self).__init__() + self.feature_dim = args.feature_dim + self.bs_dim = args.bs_dim + self.device = args.device + self.batch_size = args.batch_size + self.audio_encoder_cont = Wav2Vec2Model.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english") + self.processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english") + self.audio_encoder_cont.feature_extractor._freeze_parameters() + self.audio_encoder_emo = Wav2Vec2ForSpeechClassification.from_pretrained( + "r-f/wav2vec-english-speech-emotion-recognition") + self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained( + "r-f/wav2vec-english-speech-emotion-recognition") + self.audio_encoder_emo.wav2vec2.feature_extractor._freeze_parameters() + self.max_seq_len = args.max_seq_len + self.audio_feature_map_cont = nn.Linear(1024, 512) + self.audio_feature_map_emo = nn.Linear(1024, 832) + self.audio_feature_map_emo2 = nn.Linear(832, 256) + self.relu = nn.ReLU() + self.biased_mask1 = init_biased_mask(n_head=4, max_seq_len=args.max_seq_len, period=args.period) + self.one_hot_level = np.eye(2) + self.obj_vector_level = nn.Linear(2, 32) + self.one_hot_person = np.eye(24) + self.obj_vector_person = nn.Linear(24, 32) + decoder_layer = nn.TransformerDecoderLayer(d_model=args.feature_dim, nhead=4, dim_feedforward=args.feature_dim, + batch_first=True) + self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=1) + self.bs_map_r 
= nn.Linear(self.feature_dim, self.bs_dim) + nn.init.constant_(self.bs_map_r.weight, 0) + nn.init.constant_(self.bs_map_r.bias, 0) + + def forward(self, data): + frame_num11 = data["target11"].shape[1] + frame_num12 = data["target12"].shape[1] + inputs12 = self.processor(torch.squeeze(data["input12"]), sampling_rate=16000, return_tensors="pt", + padding="longest").input_values.to(self.device) + hidden_states_cont1 = self.audio_encoder_cont(inputs12, frame_num=frame_num11).last_hidden_state + hidden_states_cont12 = self.audio_encoder_cont(inputs12, frame_num=frame_num12).last_hidden_state + inputs21 = self.feature_extractor(torch.squeeze(data["input21"]), sampling_rate=16000, padding=True, + return_tensors="pt").input_values.to(self.device) + inputs12 = self.feature_extractor(torch.squeeze(data["input12"]), sampling_rate=16000, padding=True, + return_tensors="pt").input_values.to(self.device) + + output_emo1 = self.audio_encoder_emo(inputs21, frame_num=frame_num11) + output_emo2 = self.audio_encoder_emo(inputs12, frame_num=frame_num12) + + hidden_states_emo1 = output_emo1.hidden_states + hidden_states_emo2 = output_emo2.hidden_states + + label1 = output_emo1.logits + onehot_level = self.one_hot_level[data["level"]] + onehot_level = torch.from_numpy(onehot_level).to(self.device).float() + onehot_person = self.one_hot_person[data["person"]] + onehot_person = torch.from_numpy(onehot_person).to(self.device).float() + if data["target11"].shape[0] == 1: + obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0) + obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0) + else: + obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0).permute(1, 0, 2) + obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0).permute(1, 0, 2) + + obj_embedding_level11 = obj_embedding_level.repeat(1, frame_num11, 1) + obj_embedding_level12 = obj_embedding_level.repeat(1, frame_num12, 1) + obj_embedding_person11 = obj_embedding_person.repeat(1, frame_num11, 1) + obj_embedding_person12 = obj_embedding_person.repeat(1, frame_num12, 1) + hidden_states_cont1 = self.audio_feature_map_cont(hidden_states_cont1) + hidden_states_emo11_832 = self.audio_feature_map_emo(hidden_states_emo1) + hidden_states_emo11_256 = self.relu(self.audio_feature_map_emo2(hidden_states_emo11_832)) + + hidden_states11 = torch.cat( + [hidden_states_cont1, hidden_states_emo11_256, obj_embedding_level11, obj_embedding_person11], dim=2) + hidden_states_cont12 = self.audio_feature_map_cont(hidden_states_cont12) + hidden_states_emo12_832 = self.audio_feature_map_emo(hidden_states_emo2) + hidden_states_emo12_256 = self.relu(self.audio_feature_map_emo2(hidden_states_emo12_832)) + + hidden_states12 = torch.cat( + [hidden_states_cont12, hidden_states_emo12_256, obj_embedding_level12, obj_embedding_person12], dim=2) + if data["target11"].shape[0] == 1: + tgt_mask11 = self.biased_mask1[:, :hidden_states11.shape[1], :hidden_states11.shape[1]].clone().detach().to( + device=self.device) + tgt_mask22 = self.biased_mask1[:, :hidden_states12.shape[1], :hidden_states12.shape[1]].clone().detach().to( + device=self.device) + + memory_mask11 = enc_dec_mask(self.device, hidden_states11.shape[1], hidden_states11.shape[1]) + memory_mask12 = enc_dec_mask(self.device, hidden_states12.shape[1], hidden_states12.shape[1]) + bs_out11 = self.transformer_decoder(hidden_states11, hidden_states_emo11_832, tgt_mask=tgt_mask11, + memory_mask=memory_mask11) + bs_out12 = self.transformer_decoder(hidden_states12, 
hidden_states_emo12_832, tgt_mask=tgt_mask22, + memory_mask=memory_mask12) + bs_output11 = self.bs_map_r(bs_out11) + bs_output12 = self.bs_map_r(bs_out12) + + return bs_output11, bs_output12, label1 + + def predict(self, audio, level, person): + frame_num11 = math.ceil(audio.shape[1] / 16000 * 30) + inputs12 = self.processor(torch.squeeze(audio), sampling_rate=16000, return_tensors="pt", + padding="longest").input_values.to(self.device) + hidden_states_cont1 = self.audio_encoder_cont(inputs12, frame_num=frame_num11).last_hidden_state + inputs12 = self.feature_extractor(torch.squeeze(audio), sampling_rate=16000, padding=True, + return_tensors="pt").input_values.to(self.device) + output_emo1 = self.audio_encoder_emo(inputs12, frame_num=frame_num11) + hidden_states_emo1 = output_emo1.hidden_states + + onehot_level = self.one_hot_level[level] + onehot_level = torch.from_numpy(onehot_level).to(self.device).float() + onehot_person = self.one_hot_person[person] + onehot_person = torch.from_numpy(onehot_person).to(self.device).float() + if audio.shape[0] == 1: + obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0) + obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0) + else: + obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0).permute(1, 0, 2) + obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0).permute(1, 0, 2) + + obj_embedding_level11 = obj_embedding_level.repeat(1, frame_num11, 1) + obj_embedding_person11 = obj_embedding_person.repeat(1, frame_num11, 1) + hidden_states_cont1 = self.audio_feature_map_cont(hidden_states_cont1) + hidden_states_emo11_832 = self.audio_feature_map_emo(hidden_states_emo1) + hidden_states_emo11_256 = self.relu( + self.audio_feature_map_emo2(hidden_states_emo11_832)) + + hidden_states11 = torch.cat( + [hidden_states_cont1, hidden_states_emo11_256, obj_embedding_level11, obj_embedding_person11], dim=2) + if audio.shape[0] == 1: + tgt_mask11 = self.biased_mask1[:, :hidden_states11.shape[1], + :hidden_states11.shape[1]].clone().detach().to(device=self.device) + + memory_mask11 = enc_dec_mask(self.device, hidden_states11.shape[1], hidden_states11.shape[1]) + bs_out11 = self.transformer_decoder(hidden_states11, hidden_states_emo11_832, tgt_mask=tgt_mask11, + memory_mask=memory_mask11) + bs_output11 = self.bs_map_r(bs_out11) + + return bs_output11 diff --git a/emotalk_own/readme.md b/emotalk_own/readme.md new file mode 100644 index 0000000..68a5350 --- /dev/null +++ b/emotalk_own/readme.md @@ -0,0 +1,103 @@ +![Psyche AI Inc release](./media/psy_logo.png) + +# EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation [ICCV2023] + +Official PyTorch implementation for the paper: + +> **EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation**, ***ICCV 2023***. +> +> Ziqiao Peng, Haoyu Wu, Zhenbo Song, Hao Xu, Xiangyu Zhu, Jun He, Hongyan Liu, Zhaoxin Fan +> +> [Arxiv](https://arxiv.org/abs/2303.11089) | [Project Page](https://ziqiaopeng.github.io/emotalk/) | [License](https://github.com/psyai-net/EmoTalk_release/blob/main/LICENSE) + + + +

+ +

+ +> Given audio input expressing different emotions, EmoTalk produces realistic 3D facial animation sequences with corresponding emotional expressions as outputs. +## News +- `2023.10.17` Thanks to [noirmist](https://github.com/noirmist)! Now you can create the environment via docker. +## Environment + +- Linux +- Python 3.8.8 +- Pytorch 1.12.1 +- CUDA 11.3 +- Blender 3.4.1 +- ffmpeg 4.4.1 + +Clone the repo: + ```bash + git clone https://github.com/psyai-net/EmoTalk_release.git + cd EmoTalk_release + ``` +Create conda environment: +```bash +conda create -n emotalk python=3.8.8 +conda activate emotalk +pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113 +pip install -r requirements.txt +``` + + +## **Demo** +Download Blender and put it in this directory. +```bash +wget https://ftp.nluug.nl/pub/graphics/blender/release/Blender3.4/blender-3.4.1-linux-x64.tar.xz +tar -xf blender-3.4.1-linux-x64.tar.xz +mv blender-3.4.1-linux-x64 blender && rm blender-3.4.1-linux-x64.tar.xz +``` +Download the pretrained models from [EmoTalk.pth](https://drive.google.com/file/d/1KQZ-WGI9VDFLqgNXvJQosKVCbjTaCPqK/view?usp=drive_link) (Updated). Put the pretrained models under `pretrain_model` folder. +Put the audio under `aduio` folder and run +```bash +python demo.py --wav_path "./audio/disgust.wav" +``` +The generated animation will be saved in `result` folder. + + +## **Dataset** +If someone wants to download the 3D-ETF dataset, please fill in the [agreement](https://drive.google.com/file/d/1AQ5_focSgw9WiJdA2R44BQOrdTUe2ABd/view?usp=drive_link), and use the education mailbox to email Ziqiao Peng (pengziqiao@ruc.edu.cn) and cc Zhaoxin Fan (fanzhaoxin@psyai.net) to request the download link. + +## **Citation** +If you find this work useful for your research, please cite our paper: +``` +@InProceedings{Peng_2023_ICCV, + author = {Peng, Ziqiao and Wu, Haoyu and Song, Zhenbo and Xu, Hao and Zhu, Xiangyu and He, Jun and Liu, Hongyan and Fan, Zhaoxin}, + title = {EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation}, + booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)}, + month = {October}, + year = {2023}, + pages = {20687-20697} +} +``` + +## **Acknowledgement** +Here are some great resources we benefit: +- [Faceformer](https://github.com/EvelynFan/FaceFormer) for training pipeline +- [EVP](https://github.com/jixinya/EVP) for training dataloader +- [Speech-driven-expressions](https://github.com/YoungSeng/Speech-driven-expressions) for rendering +- [Wav2Vec2 Content](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english) and [Wav2Vec2 Emotion](https://huggingface.co/r-f/wav2vec-english-speech-emotion-recognition) for audio encoder +- [Head Template](http://filmicworlds.com/blog/solving-face-scans-for-arkit/) for visualization. + +Thanks to John Hable for sharing his head template under the CC0 license, which is very helpful for us to visualize the results. + +## **Contact** +For research purpose, such as comparison of experimental results, please contact pengziqiao@ruc.edu.cn + +For commercial licensing, please contact fanzhaoxin@psyai.net + +## **License** +This project is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. Please read the [LICENSE](LICENSE) file for more information. 
+ +## **Invitation** + +We invite you to join [Psyche AI Inc](https://www.psyai.com/home) to conduct cutting-edge research and business implementation together. At Psyche AI Inc, we are committed to pushing the boundaries of what's possible in the fields of artificial intelligence and computer vision, especially their applications in avatars. As a member of our team, you will have the opportunity to collaborate with talented individuals, innovate new ideas, and contribute to projects that have a real-world impact. + +If you are passionate about working on the forefront of technology and making a difference, we would love to hear from you. Please visit our website at [Psyche AI Inc](https://www.psyai.com/home) to learn more about us and to apply for open positions. You can also contact us by fanzhaoxin@psyai.net. + +Let's shape the future together!! + + + diff --git a/emotalk_own/render.py b/emotalk_own/render.py new file mode 100644 index 0000000..c627ab1 --- /dev/null +++ b/emotalk_own/render.py @@ -0,0 +1,87 @@ +import bpy +import os +import numpy as np +import sys + +filename = str(sys.argv[-1]) +root_dir = str(sys.argv[-2]) + +model_bsList = ["browDownLeft", + "browDownRight", + "browInnerUp", + "browOuterUpLeft", + "browOuterUpRight", + "cheekPuff", + "cheekSquintLeft", + "cheekSquintRight", + "eyeBlinkLeft", + "eyeBlinkRight", + "eyeLookDownLeft", + "eyeLookDownRight", + "eyeLookInLeft", + "eyeLookInRight", + "eyeLookOutLeft", + "eyeLookOutRight", + "eyeLookUpLeft", + "eyeLookUpRight", + "eyeSquintLeft", + "eyeSquintRight", + "eyeWideLeft", + "eyeWideRight", + "jawForward", + "jawLeft", + "jawOpen", + "jawRight", + "mouthClose", + "mouthDimpleLeft", + "mouthDimpleRight", + "mouthFrownLeft", + "mouthFrownRight", + "mouthFunnel", + "mouthLeft", + "mouthLowerDownLeft", + "mouthLowerDownRight", + "mouthPressLeft", + "mouthPressRight", + "mouthPucker", + "mouthRight", + "mouthRollLower", + "mouthRollUpper", + "mouthShrugLower", + "mouthShrugUpper", + "mouthSmileLeft", + "mouthSmileRight", + "mouthStretchLeft", + "mouthStretchRight", + "mouthUpperUpLeft", + "mouthUpperUpRight", + "noseSneerLeft", + "noseSneerRight", + "tongueOut"] + +obj = bpy.data.objects["face"] + +bpy.context.scene.render.engine = 'BLENDER_WORKBENCH' +bpy.context.scene.display.shading.light = 'MATCAP' +bpy.context.scene.display.render_aa = 'FXAA' +bpy.context.scene.render.resolution_x = int(512) +bpy.context.scene.render.resolution_y = int(768) +bpy.context.scene.render.fps = 30 +bpy.context.scene.render.image_settings.file_format = 'PNG' + +cam = bpy.data.objects['Camera'] +cam.scale = [2, 2, 2] +bpy.context.scene.camera = cam + +output_dir = root_dir + filename +blendshape_path = root_dir + filename + '.npy' + +result = [] +bs = np.load(blendshape_path) + +for i in range(bs.shape[0]): + curr_bs = bs[i] + for j in range(52): + obj.data.shape_keys.key_blocks[model_bsList[j]].value = curr_bs[j] + bpy.context.scene.render.filepath = os.path.join(output_dir, '{}.png'.format(i)) + bpy.ops.render.render(write_still=True) diff --git a/emotalk_own/requirements.txt b/emotalk_own/requirements.txt new file mode 100644 index 0000000..735be5e --- /dev/null +++ b/emotalk_own/requirements.txt @@ -0,0 +1,5 @@ +numpy~=1.21.6 +transformers~=4.26.0 +tqdm~=4.64.1 +librosa~=0.10.0 +scipy~=1.9.1 \ No newline at end of file diff --git a/emotalk_own/utils.py b/emotalk_own/utils.py new file mode 100644 index 0000000..3283aae --- /dev/null +++ b/emotalk_own/utils.py @@ -0,0 +1,39 @@ +# Borrowed from 
https://github.com/EvelynFan/FaceFormer/blob/main/faceformer.py +import torch +import math + + +# Temporal Bias +def init_biased_mask(n_head, max_seq_len, period): + def get_slopes(n): + def get_slopes_power_of_2(n): + start = (2 ** (-2 ** -(math.log2(n) - 3))) + ratio = start + return [start * ratio ** i for i in range(n)] + + if math.log2(n).is_integer(): + return get_slopes_power_of_2(n) + else: + closest_power_of_2 = 2 ** math.floor(math.log2(n)) + return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][ + :n - closest_power_of_2] + + slopes = torch.Tensor(get_slopes(n_head)) + bias = torch.arange(start=0, end=max_seq_len, step=period).unsqueeze(1).repeat(1, period).view(-1) // (period) + bias = - torch.flip(bias, dims=[0]) + alibi = torch.zeros(max_seq_len, max_seq_len) + for i in range(max_seq_len): + alibi[i, :i + 1] = bias[-(i + 1):] + alibi = slopes.unsqueeze(1).unsqueeze(1) * alibi.unsqueeze(0) + mask = (torch.triu(torch.ones(max_seq_len, max_seq_len)) == 1).transpose(0, 1) + mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0)) + mask = mask.unsqueeze(0) + alibi + return mask + + +# Alignment Bias +def enc_dec_mask(device, T, S): + mask = torch.ones(T, S).to(device) + for i in range(T): + mask[i, i] = 0 + return (mask == 1).to(device=device) diff --git a/emotalk_own/wav2vec.py b/emotalk_own/wav2vec.py new file mode 100755 index 0000000..c9f8090 --- /dev/null +++ b/emotalk_own/wav2vec.py @@ -0,0 +1,245 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +import numpy as np +from dataclasses import dataclass +from transformers import Wav2Vec2Model, Wav2Vec2PreTrainedModel +from transformers.modeling_outputs import BaseModelOutput +from typing import Optional, Tuple +from transformers.file_utils import ModelOutput +from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss + +_CONFIG_FOR_DOC = "Wav2Vec2Config" +_HIDDEN_STATES_START_POSITION = 2 + + +# the implementation of Wav2Vec2Model is borrowed from https://huggingface.co/transformers/_modules/transformers/models/wav2vec2/modeling_wav2vec2.html#Wav2Vec2Model +# initialize our encoder with the pre-trained wav2vec 2.0 weights. 
+def _compute_mask_indices( + shape: Tuple[int, int], + mask_prob: float, + mask_length: int, + attention_mask: Optional[torch.Tensor] = None, + min_masks: int = 0, +) -> np.ndarray: + bsz, all_sz = shape + mask = np.full((bsz, all_sz), False) + + all_num_mask = int( + mask_prob * all_sz / float(mask_length) + + np.random.rand() + ) + all_num_mask = max(min_masks, all_num_mask) + mask_idcs = [] + padding_mask = attention_mask.ne(1) if attention_mask is not None else None + for i in range(bsz): + if padding_mask is not None: + sz = all_sz - padding_mask[i].long().sum().item() + num_mask = int( + mask_prob * sz / float(mask_length) + + np.random.rand() + ) + num_mask = max(min_masks, num_mask) + else: + sz = all_sz + num_mask = all_num_mask + + lengths = np.full(num_mask, mask_length) + + if sum(lengths) == 0: + lengths[0] = min(mask_length, sz - 1) + + min_len = min(lengths) + if sz - min_len <= num_mask: + min_len = sz - num_mask - 1 + + mask_idc = np.random.choice(sz - min_len, num_mask, replace=False) + mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])]) + mask_idcs.append(np.unique(mask_idc[mask_idc < sz])) + + min_len = min([len(m) for m in mask_idcs]) + for i, mask_idc in enumerate(mask_idcs): + if len(mask_idc) > min_len: + mask_idc = np.random.choice(mask_idc, min_len, replace=False) + mask[i, mask_idc] = True + return mask + + +# linear interpolation layer +def linear_interpolation(features, input_fps, output_fps, output_len=None): + features = features.transpose(1, 2) + seq_len = features.shape[2] / float(input_fps) + if output_len is None: + output_len = int(seq_len * output_fps) + output_features = F.interpolate(features, size=output_len, align_corners=True, mode='linear') + return output_features.transpose(1, 2) + + +class Wav2Vec2Model(Wav2Vec2Model): + def __init__(self, config): + super().__init__(config) + self.lm_head = nn.Linear(1024, 32) + + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + frame_num=None + ): + self.config.output_attentions = True + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.feature_extractor(input_values) + hidden_states = hidden_states.transpose(1, 2) + + hidden_states = linear_interpolation(hidden_states, 50, 30, output_len=frame_num) + + if attention_mask is not None: + output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)) + attention_mask = torch.zeros( + hidden_states.shape[:2], dtype=hidden_states.dtype, device=hidden_states.device + ) + attention_mask[ + (torch.arange(attention_mask.shape[0], device=hidden_states.device), output_lengths - 1) + ] = 1 + attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool() + + hidden_states = self.feature_projection(hidden_states)[0] + + encoder_outputs = self.encoder( + hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = encoder_outputs[0] + if not return_dict: + return (hidden_states,) + encoder_outputs[1:] + + return BaseModelOutput( + last_hidden_state=hidden_states, + 
hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +@dataclass +class SpeechClassifierOutput(ModelOutput): + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +class Wav2Vec2ClassificationHead(nn.Module): + """Head for wav2vec classification task.""" + + def __init__(self, config): + super().__init__() + self.dense = nn.Linear(config.hidden_size, config.hidden_size) + self.dropout = nn.Dropout(config.final_dropout) + self.out_proj = nn.Linear(config.hidden_size, config.num_labels) + + def forward(self, features, **kwargs): + x = features + x = self.dropout(x) + x = self.dense(x) + x = torch.tanh(x) + x = self.dropout(x) + x = self.out_proj(x) + return x + + +class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel): + def __init__(self, config): + super().__init__(config) + self.num_labels = config.num_labels + self.pooling_mode = config.pooling_mode + self.config = config + + self.wav2vec2 = Wav2Vec2Model(config) + self.classifier = Wav2Vec2ClassificationHead(config) + + self.init_weights() + + def freeze_feature_extractor(self): + self.wav2vec2.feature_extractor._freeze_parameters() + + def merged_strategy( + self, + hidden_states, + mode="mean" + ): + if mode == "mean": + outputs = torch.mean(hidden_states, dim=1) + elif mode == "sum": + outputs = torch.sum(hidden_states, dim=1) + elif mode == "max": + outputs = torch.max(hidden_states, dim=1)[0] + else: + raise Exception( + "The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']") + + return outputs + + def forward( + self, + input_values, + attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + labels=None, + frame_num=None, + ): + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + outputs = self.wav2vec2( + input_values, + attention_mask=attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + hidden_states = outputs[0] + hidden_states1 = linear_interpolation(hidden_states, 50, 30, output_len=frame_num) + hidden_states = self.merged_strategy(hidden_states1, mode=self.pooling_mode) + logits = self.classifier(hidden_states) + + loss = None + if labels is not None: + if self.config.problem_type is None: + if self.num_labels == 1: + self.config.problem_type = "regression" + elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int): + self.config.problem_type = "single_label_classification" + else: + self.config.problem_type = "multi_label_classification" + + if self.config.problem_type == "regression": + loss_fct = MSELoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels) + elif self.config.problem_type == "single_label_classification": + loss_fct = CrossEntropyLoss() + loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1)) + elif self.config.problem_type == "multi_label_classification": + loss_fct = BCEWithLogitsLoss() + loss = loss_fct(logits, labels) + + if not return_dict: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return SpeechClassifierOutput( + loss=loss, + logits=logits, + hidden_states=hidden_states1, + attentions=outputs.attentions, + ) diff --git a/main.py b/main.py new file mode 100644 index 0000000..382d5c9 --- /dev/null +++ 
b/main.py @@ -0,0 +1,4 @@ +from piedemo.web import Web +from piedemo.fields.ajax_group import AjaxChatField + + diff --git a/miapia_own/FemAdv_b350_V2_050523.py b/miapia_own/FemAdv_b350_V2_050523.py new file mode 100644 index 0000000..a7ac744 --- /dev/null +++ b/miapia_own/FemAdv_b350_V2_050523.py @@ -0,0 +1,164 @@ +import random +import shutil + +model_bsList_old = ["browDownLeft", + "browDownRight", + "browInnerUp", + "browOuterUpLeft", + "browOuterUpRight", + "cheekPuff", + "cheekSquintLeft", + "cheekSquintRight", + "eyeBlinkLeft", + "eyeBlinkRight", + "eyeLookDownLeft", + "eyeLookDownRight", + "eyeLookInLeft", + "eyeLookInRight", + "eyeLookOutLeft", + "eyeLookOutRight", + "eyeLookUpLeft", + "eyeLookUpRight", + "eyeSquintLeft", + "eyeSquintRight", + "eyeWideLeft", + "eyeWideRight", + "jawForward", + "jawLeft", + "jawOpen", + "jawRight", + "mouthClose", + "mouthDimpleLeft", + "mouthDimpleRight", + "mouthFrownLeft", + "mouthFrownRight", + "mouthFunnel", + "mouthLeft", + "mouthLowerDownLeft", + "mouthLowerDownRight", + "mouthPressLeft", + "mouthPressRight", + "mouthPucker", + "mouthRight", + "mouthRollLower", + "mouthRollUpper", + "mouthShrugLower", + "mouthShrugUpper", + "mouthSmileLeft", + "mouthSmileRight", + "mouthStretchLeft", + "mouthStretchRight", + "mouthUpperUpLeft", + "mouthUpperUpRight", + "noseSneerLeft", + "noseSneerRight", + "tongueOut"] + +import bpy +import os +import numpy as np +import sys + +filename = str(sys.argv[-1]) +root_dir = str(sys.argv[-2]) + +object_name = "MFA_body" +obj = bpy.data.objects[object_name] + +bpy.context.scene.render.engine = 'BLENDER_WORKBENCH' +bpy.context.scene.display.shading.light = 'MATCAP' +bpy.context.scene.display.render_aa = 'FXAA' +bpy.context.scene.render.resolution_x = int(512) +bpy.context.scene.render.resolution_y = int(768) +bpy.context.scene.render.fps = 30 +bpy.context.scene.render.image_settings.file_format = 'PNG' + +cam = bpy.data.objects['0Camera'] +cam.scale = [2, 2, 2] +bpy.context.scene.camera = cam + +""" +model_bsList = ['Basis', + '0', + 'X_postrig', + 'X_neck', + 'X_head', + 'X_eyesfix', + 'X_breast', + 'X_nails', + 'X_pus_conf.1', + 'X_pus_assym', 'X_jadafication', + 'X_facetweak', 'X_eyeshape', + 'A_nipple_in', 'A_nailsmax', + 'A_pregnant', 'PAD_breathe', + 'PAD_swallow', 'Head', + 'cr_neck1', 'cr_neck2', + 'cr_neck3.R', 'cr_neck3.L', + 'cr_neck4.L', 'cr_neck4.R', 'cr_jaw1', 'cr_jaw2', 'sqz_jaw3', 'cr_brows_dwn', 'cr_brows_up', + 'cr_eye_lookdown', 'cr_eye_open', + 'cr_eye_look.L', 'cr_eye_look.R', 'cr_mouthmax.L', 'cr_mouthmax.R', 'cr_cheekin.L', 'cr_cheekin.R', 'Body', 'cr_spine', + 'cr_spine2', 'cr_spine3', 'cr_spine2.L', + 'cr_spine2.R', 'cr_spine4.L', 'cr_spine4.R', + 'cr_spine5.L', 'cr_spine5.R', 'cr_lowerspine.bcw', + 'cr_lowerspine.fwd', 'size_breastXL.L', 'size_breastXL.R', + 'size_breastXS.L', 'size_breastXS.R', 'size_oreola.L', + 'size_oreola.R', 'Legs', 'cr_hipout.L', 'cr_hipout.R', + 'cr_hipin.L', 'cr_hipin.R', 'cr_pussyflattern', + 'cr_hip0.L', 'cr_hip0.R', 'cr_hip1.L', 'cr_hip1.R', + 'cr_hip45.L', 'cr_hip45.R', 'sqz_hip1max.L', + 'sqz_hip1max.R', 'sqz_hip1vol.L', 'sqz_hip1vol.R', + 'sqz_hip1squeeze.L', 'sqz_hip1squeeze.R', 'cr_hip2.L', + 'cr_hip2.R', 'sqz_hip2.L', 'sqz_hip2.R', 'cr_hip3.L', + 'cr_hip3.R', 'sqz_buttrest.L', 'sqz_buttrest.R', + 'cr_knee45.L', 'cr_knee45.R', 'cr_knee.L', 'cr_knee.R', + 'sqz_knee.L', 'sqz_knee.R', 'sqz_stance.L', 'sqz_stance.R', + 'cr_buttheart.L', 'cr_buttheart.R', 'rest_buttcheek.L', + 'rest_buttcheek.R', 'rest_knee.L', 'rest_knee.R', 'rest_knee_fat.L', + 
'rest_knee_fat.R', 'rest_hip.L', 'rest_hip.R', 'vol_butt.L', + 'vol_butt.R', 'Feet', 'cr_feet1.L', 'cr_feet1.R', 'cr_feet2.L', + 'cr_feet2.R', 'cr_feet3.L', 'cr_feet3.R', 'cr_toe1.L', 'cr_toe1.R', + 'cr_toe2.L', 'cr_toe2.R', 'Arms', 'cr_arm-up.L', 'cr_arm-up.R', + 'cr_arm-fwd.L', 'cr_arm-fwd.R', 'cr_arm-dwn.L', 'cr_arm-dwn.R', + 'sqz_arm-fwd.L', 'sqz_arm-fwd.R', 'sqz_armpit.L', 'sqz_armpit.R', + 'sqz_arm-bcw.L', 'sqz_arm-bcw.R', 'sqz_arm-bcw_max.L', + 'sqz_arm-bcw_max.R', 'cr_arm-trc.L', 'cr_arm-trc.R', + 'D_cr_elbow.L', 'U_cr_elbow.L', 'D_cr_elbow.R', 'U_cr_elbow.R', + 'D_sqz_elbowMax.L', 'U_sqz_elbowMax.L', 'D_sqz_elbowMax.R', + 'U_sqz_elbowMax.R', 'cr_armrest.L', 'cr_armrest.R', + 'cr_shoulder_fwd.L', 'cr_shoulder_fwd.R', 'cr_shoulder_bcw.L', + 'cr_shoulder_bcw.R', 'cr_shoulder_dwn.L', 'cr_shoulder_dwn.R', + 'cr_shoulder_up.L', 'cr_shoulder_up.R', 'rest_elbow.L', 'rest_elbow.R', + 'Hands', 'cr_hand1.L', 'cr_hand1.R', + 'cr_hand2.L', 'cr_hand2.R', 'cr_handtwistU.L', 'cr_handtwistU.R', + 'cr_handtwistD.L', + 'cr_handtwistD.R', + 'cr_thumb.01.L', 'cr_thumb.01.R', + 'cr_f_index.01.L', 'cr_f_index.01.R', 'cr_f_index.02.L', + 'cr_f_index.02.R', + 'cr_f_middle.01.L', 'cr_f_middle.01.R', 'cr_f_middle.02.L', + 'cr_f_middle.02.R', 'cr_f_ring.01.L', 'cr_f_ring.01.R', + 'cr_f_ring.02.L', 'cr_f_ring.02.R', 'cr_f_pinky.01.L', + 'cr_f_pinky.01.R', 'cr_f_pinky.02.L', 'cr_f_pinky.02.R', 'EM', + 'em_eye_close.L', 'em_eye_close.R', 'em_eye_half.L', 'em_eye_half.R', + 'em_smile_open', 'em_smile_close', 'em_kiss', 'em_disg', 'em_blow', + 'em_surprise', 'em_sad', 'em_frown', 'PH', 'ph_+', 'ph_bpm', + 'ph_fv', 'ph_ou', + 'ph_e', 'ph_r', 'ph_ch', 'ph_th', 'ph_a']""" + +model_bsList = list(obj.data.shape_keys.key_blocks.keys()) + + +# print(obj.data.shape_keys.key_blocks.keys()) + +output_dir = root_dir + filename +blendshape_path = root_dir + filename + '.npy' + +result = [] +bs = np.load(blendshape_path) + +for i in range(10): + for kp_name in model_bsList: + obj.data.shape_keys.key_blocks[kp_name].value = random.random() + bpy.context.scene.render.filepath = os.path.join(output_dir, + '{}.png'.format(i)) + bpy.ops.render.render(write_still=True) diff --git a/miapia_own/__init__.py b/miapia_own/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/miapia_own/a.py b/miapia_own/a.py new file mode 100644 index 0000000..d4f3a98 --- /dev/null +++ b/miapia_own/a.py @@ -0,0 +1,57 @@ +import bpy +import os +import numpy as np +import sys + +filename = str(sys.argv[-1]) +root_dir = str(sys.argv[-2]) + +object_name = "MFA_body" +obj = bpy.data.objects[object_name] + +bpy.context.scene.render.engine = 'BLENDER_WORKBENCH' +bpy.context.scene.display.shading.light = 'MATCAP' +bpy.context.scene.display.render_aa = 'FXAA' +bpy.context.scene.render.resolution_x = int(512) +bpy.context.scene.render.resolution_y = int(768) +bpy.context.scene.render.fps = 30 +bpy.context.scene.render.image_settings.file_format = 'PNG' + +cam = bpy.data.objects['0Camera'] +cam.scale = [2, 2, 2] +bpy.context.scene.camera = cam + + +model_bsList = ['Basis', + '0', + 'X_postrig', + 'X_neck', + 'X_head', + 'X_eyesfix', + 'X_breast', + 'X_nails', + 'X_pus_conf.1', + 'X_pus_assym', 'X_jadafication', + 'X_facetweak', 'X_eyeshape', + 'A_nipple_in', 'A_nailsmax', + 'A_pregnant', 'PAD_breathe', + 'PAD_swallow', 'Head', + 'cr_neck1', 'cr_neck2', + 'cr_neck3.R', 'cr_neck3.L', + 'cr_neck4.L', 'cr_neck4.R', 'cr_jaw1', 'cr_jaw2', 'sqz_jaw3', 'cr_brows_dwn', 'cr_brows_up', + 'cr_eye_lookdown', 'cr_eye_open', + 'cr_eye_look.L', 'cr_eye_look.R', 
'cr_mouthmax.L', 'cr_mouthmax.R', 'cr_cheekin.L', 'cr_cheekin.R', 'Body', 'cr_spine', 'cr_spine2', 'cr_spine3', 'cr_spine2.L', 'cr_spine2.R', 'cr_spine4.L', 'cr_spine4.R', 'cr_spine5.L', 'cr_spine5.R', 'cr_lowerspine.bcw', 'cr_lowerspine.fwd', 'size_breastXL.L', 'size_breastXL.R', 'size_breastXS.L', 'size_breastXS.R', 'size_oreola.L', 'size_oreola.R', 'Legs', 'cr_hipout.L', 'cr_hipout.R', 'cr_hipin.L', 'cr_hipin.R', 'cr_pussyflattern', 'cr_hip0.L', 'cr_hip0.R', 'cr_hip1.L', 'cr_hip1.R', 'cr_hip45.L', 'cr_hip45.R', 'sqz_hip1max.L', 'sqz_hip1max.R', 'sqz_hip1vol.L', 'sqz_hip1vol.R', 'sqz_hip1squeeze.L', 'sqz_hip1squeeze.R', 'cr_hip2.L', 'cr_hip2.R', 'sqz_hip2.L', 'sqz_hip2.R', 'cr_hip3.L', 'cr_hip3.R', 'sqz_buttrest.L', 'sqz_buttrest.R', 'cr_knee45.L', 'cr_knee45.R', 'cr_knee.L', 'cr_knee.R', 'sqz_knee.L', 'sqz_knee.R', 'sqz_stance.L', 'sqz_stance.R', 'cr_buttheart.L', 'cr_buttheart.R', 'rest_buttcheek.L', 'rest_buttcheek.R', 'rest_knee.L', 'rest_knee.R', 'rest_knee_fat.L', 'rest_knee_fat.R', 'rest_hip.L', 'rest_hip.R', 'vol_butt.L', 'vol_butt.R', 'Feet', 'cr_feet1.L', 'cr_feet1.R', 'cr_feet2.L', 'cr_feet2.R', 'cr_feet3.L', 'cr_feet3.R', 'cr_toe1.L', 'cr_toe1.R', 'cr_toe2.L', 'cr_toe2.R', 'Arms', 'cr_arm-up.L', 'cr_arm-up.R', 'cr_arm-fwd.L', 'cr_arm-fwd.R', 'cr_arm-dwn.L', 'cr_arm-dwn.R', 'sqz_arm-fwd.L', 'sqz_arm-fwd.R', 'sqz_armpit.L', 'sqz_armpit.R', 'sqz_arm-bcw.L', 'sqz_arm-bcw.R', 'sqz_arm-bcw_max.L', 'sqz_arm-bcw_max.R', 'cr_arm-trc.L', 'cr_arm-trc.R', 'D_cr_elbow.L', 'U_cr_elbow.L', 'D_cr_elbow.R', 'U_cr_elbow.R', 'D_sqz_elbowMax.L', 'U_sqz_elbowMax.L', 'D_sqz_elbowMax.R', 'U_sqz_elbowMax.R', 'cr_armrest.L', 'cr_armrest.R', 'cr_shoulder_fwd.L', 'cr_shoulder_fwd.R', 'cr_shoulder_bcw.L', 'cr_shoulder_bcw.R', 'cr_shoulder_dwn.L', 'cr_shoulder_dwn.R', 'cr_shoulder_up.L', 'cr_shoulder_up.R', 'rest_elbow.L', 'rest_elbow.R', 'Hands', 'cr_hand1.L', 'cr_hand1.R', 'cr_hand2.L', 'cr_hand2.R', 'cr_handtwistU.L', 'cr_handtwistU.R', 'cr_handtwistD.L', 'cr_handtwistD.R', 'cr_thumb.01.L', 'cr_thumb.01.R', 'cr_f_index.01.L', 'cr_f_index.01.R', 'cr_f_index.02.L', 'cr_f_index.02.R', 'cr_f_middle.01.L', 'cr_f_middle.01.R', 'cr_f_middle.02.L', 'cr_f_middle.02.R', 'cr_f_ring.01.L', 'cr_f_ring.01.R', 'cr_f_ring.02.L', 'cr_f_ring.02.R', 'cr_f_pinky.01.L', 'cr_f_pinky.01.R', 'cr_f_pinky.02.L', 'cr_f_pinky.02.R', 'EM', 'em_eye_close.L', 'em_eye_close.R', 'em_eye_half.L', 'em_eye_half.R', 'em_smile_open', 'em_smile_close', 'em_kiss', 'em_disg', 'em_blow', 'em_surprise', 'em_sad', 'em_frown', 'PH', 'ph_+', 'ph_bpm', 'ph_fv', 'ph_ou', 'ph_e', 'ph_r', 'ph_ch', 'ph_th', 'ph_a'] + +# print(obj.data.shape_keys.key_blocks.keys()) + +output_dir = root_dir + filename +blendshape_path = root_dir + filename + '.npy' + +result = [] +bs = np.load(blendshape_path) + +for i in range(bs.shape[0]): + obj.data.shape_keys.key_blocks['cr_eye_open'].value = i / bs.shape[0] + bpy.context.scene.render.filepath = os.path.join(output_dir, + '{}.png'.format(i)) + bpy.ops.render.render(write_still=True) diff --git a/miapia_own/aihandler.py b/miapia_own/aihandler.py new file mode 100644 index 0000000..9ec193a --- /dev/null +++ b/miapia_own/aihandler.py @@ -0,0 +1,36 @@ + +import requests + + +class AIHandler(object): + def __init__(self): + pass + + def __call__(self, text): + resp = requests.post("https://fast-pia.avemio.technology/chat-completion", + json={ + "session-id": "chatcmpl", + "user-location": "Zweibrücken", + "wheel-of-life": [ + { + "personal_growth": 10, + "health_exercise": 5, + "familiy_friends": 5, + 
"romance_relationship": 5, + "career_work": 5, + "finances": 5, + "recreation_fun": 5, + "living_situation": 5} + ], + "messages": [ + { + "role": "user", + "content": text + } + ] + }) + resp = resp.json() + return { + "text": resp[0]['text'], + "emotion": resp[0]['emotion'] + } diff --git a/miapia_own/main.py b/miapia_own/main.py new file mode 100644 index 0000000..b265b50 --- /dev/null +++ b/miapia_own/main.py @@ -0,0 +1,243 @@ +import sys + +import pandas as pd +import argparse +import base64 + +from flask import send_file, Response +from flask_socketio import emit +from piedemo.fields.ajax_group import AjaxChatField, AjaxGroup +from piedemo.fields.grid import VStack, HStack, SpaceField +from piedemo.fields.inputs.hidden import InputHiddenField +from piedemo.fields.outputs.colored_text import ptext, OutputColoredTextField +from piedemo.fields.outputs.json import OutputJSONField +from piedemo.fields.outputs.progress import ProgressField +from piedemo.fields.outputs.video import OutputVideoField +from piedemo.web import Web +import os +import io +from piedemo.page import Page +from piedemo.hub.svgpil import SVGImage +from piedemo.fields.outputs.table import OutputTableField +from piedemo.fields.inputs.int_list import InputIntListField +from piedemo.fields.navigation import Navigation +from piedemo.fields.inputs.chat import ChatField +import librosa +import uuid +import numpy as np +import redis +import argparse +from scipy.signal import savgol_filter +import torch +import random +import os, subprocess +import shlex + +from tqdm import tqdm + +from aihandler import AIHandler +from pieinfer import PieInfer, render_video, construct_video +import torch +from TTS.api import TTS + +# Get device +device = "cuda" if torch.cuda.is_available() else "cpu" + + +def get_asset(fname): + return SVGImage.open(os.path.join(os.path.dirname(__file__), + "assets", + fname)).svg_content + + +class MainPage(Page): + def __init__(self, model_name: str): + super(MainPage, self).__init__() + self.infer = PieInfer() + self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device) + + self.r = redis.Redis(host='localhost', port=6379, decode_responses=True) + self.aihandler = AIHandler() + + self.fields = Navigation(AjaxGroup("ChatGroup", VStack([ + HStack([ + AjaxChatField("Chat", + self.register_ajax(f"/refresh_{model_name}", + self.message_sent), + deps_names=["sid", + "session_id", + "Chat", + "Chat__piedemo__file"], + use_socketio_support=True, + nopie=True, + style={ + "height": "100%" + }), + OutputColoredTextField("video", + nopie=True, + use_socketio_support=True), + ], xs=[8, 4]), + ProgressField("progress", + nopie=True, + use_socketio_support=True), + InputHiddenField("session_id", None), + ]), no_return=True), no_submit=True, page_title="MIA PIA", page_style={ + + }) + self.fields.add_link("SIMPLE", + "/simple", + active=model_name == "render") + self.fields.add_link("MIA PIA", + "/nice", + active=model_name != "render") + self.model_name = model_name + + def get_content(self, **kwargs): + fields = self.fields.copy() + fields.child_loc["Chat"].set_default_options(["Hello! 
What is your name?", "Say one word and stop."]) + """ + fields.child_loc["Chat"].set_avatars({ + "self": get_asset("avatar.svg"), + "ChatGPT": get_asset("dog.svg"), + }) + """ + session_id = str(uuid.uuid4()) + return self.fill(fields, { + "video": f""" + """, + "session_id": session_id, + }) + + def message_sent(self, **data): + sid = data['sid'] + self.emit(self.fields.child_loc["Chat"].clear_input(), + to=sid) + self.emit(self.fields.child_loc["video"].update(f""" + """)) + data = self.parse(self.fields, data) + session_id = data['session_id'] + messages_map = self.r.hgetall(f'user-session:{session_id}') + messages = [self.fields.child_loc["Chat"].format_message("self" if i % 2 == 0 else "ChatGPT", + messages_map[f"message_{i}"]) + for i in range(len(messages_map))] + + print("history: ", messages) + + text = data['Chat']['text'] + + self.emit(self.fields.child_loc["Chat"].update(messages + [ + self.fields.child_loc["Chat"].format_message("self", text), + self.fields.child_loc["Chat"].format_message("ChatGPT", "Generating text..."), + ]), to=sid) + + output = self.aihandler(text) + output_text = output['text'] + output_emotion = output['emotion'] + + messages_map[f"message_{len(messages)}"] = text + messages_map[f"message_{len(messages) + 1}"] = output_text + self.r.hset(f'user-session:{session_id}', mapping=messages_map) + + self.emit(self.fields.child_loc["Chat"].update(messages + [ + self.fields.child_loc["Chat"].format_message("self", text), + self.fields.child_loc["Chat"].format_message("ChatGPT", "Generating audio..."), + ]), to=sid) + + self.tts.tts_to_file(text=output_text, + speaker_wav="/home/ubuntu/repo/of_couse_here.wav", + language="en", + emotion=output_emotion, + file_path=f"./audio/{session_id}.wav") + speech_array, sampling_rate = librosa.load(f"./audio/{session_id}.wav", + sr=16000) + output = self.infer(speech_array, sampling_rate) + np.save(os.path.join("./audio", "{}.npy".format(session_id)), + output) + + self.emit(self.fields.child_loc["Chat"].update(messages + [ + self.fields.child_loc["Chat"].format_message("self", text), + self.fields.child_loc["Chat"].format_message("ChatGPT", "Rendering..."), + ]), to=sid) + + n = output.shape[0] + for i, fname in enumerate(tqdm(render_video(f"{session_id}", + model_name=self.model_name), + total=n)): + print("Got frame: ", fname, file=sys.stderr) + self.emit(self.fields.child_loc["progress"].update(100 * i // n), + to=sid) + construct_video(session_id) + + self.emit(self.fields.child_loc["video"].update(f""" + + """), to=sid) + + '''self.emit(self.fields.child_loc["video"].update(f""" + + """))''' + self.emit(self.fields.child_loc["Chat"].update(messages + [ + self.fields.child_loc["Chat"].format_message("self", text), + self.fields.child_loc["Chat"].format_message("ChatGPT", output_text), + ]), to=sid) + + +web = Web({ + "": "simple", + "simple": MainPage("render"), + "nice": MainPage("FemAdv_b350_V2_050523"), +}, use_socketio_support=True) + + +host = '0.0.0.0' +port = 8011 +debug = False +app = web.get_app() + + +@app.route("/api/video/", methods=["GET"]) +def get_video(session_id): + return send_file("./audio/{}.mp4".format(session_id)) + + +def gen(session_id): + for image_path in render_video(f"{session_id}"): + with open(image_path, 'rb') as f: + yield (b'--frame\r\n' + b'Content-Type: image/jpeg\r\n\r\n' + f.read() + b'\r\n') + construct_video(session_id) + + +@app.route("/api/video/stream/", methods=["GET"]) +def get_video_async(session_id): + return Response(gen(session_id), + mimetype='multipart/x-mixed-replace; 
boundary=frame') + + +io = web.get_socketio(app) + + +@io.on("io_set_text") +def io_set_text(data): + sid = None + if "text" not in data: + emit("io_error", {"message": "Text not found"}, + to=sid) + + encode_string = base64.b64encode(open("../feeling_good.wav", "rb").read()) + for i in range(10): + j = random.randint(0, 2) + emit("io_set_coef", [{ + "index": j, + "value": i / 10, + }], to=sid) + emit("io_push_audio_blob", { + "dataURL": f"base64,{encode_string}" + }, to=sid) + emit("io_finish", {}, to=sid) + + +io.run(app, + host=host, port=port, debug=debug, + allow_unsafe_werkzeug=True) diff --git a/miapia_own/pieinfer.py b/miapia_own/pieinfer.py new file mode 100644 index 0000000..690b4f6 --- /dev/null +++ b/miapia_own/pieinfer.py @@ -0,0 +1,153 @@ +import librosa +import numpy as np +import argparse + +from parse import parse +from scipy.signal import savgol_filter +import torch +from model import EmoTalk +import random +import os, subprocess +import shlex +from munch import Munch + + +@torch.no_grad() +def test(model, speech_array, sampling_rate): + args = Munch( + bs_dim=52, + feature_dim=832, + period=30, + device="cuda", + model_path="./pretrain_model/EmoTalk.pth", + max_seq_len=5000, + num_workers=0, + batch_size=1, + post_processing=True, + blender_path="./blender/blender") + + eye1 = np.array([0.36537236, 0.950235724, 0.95593375, 0.916715622, 0.367256105, 0.119113259, 0.025357503]) + eye2 = np.array([0.234776169, 0.909951985, 0.944758058, 0.777862132, 0.191071674, 0.235437036, 0.089163929]) + eye3 = np.array([0.870040774, 0.949833691, 0.949418545, 0.695911646, 0.191071674, 0.072576277, 0.007108896]) + eye4 = np.array([0.000307991, 0.556701422, 0.952656746, 0.942345619, 0.425857186, 0.148335218, 0.017659493]) + # speech_array, sampling_rate = librosa.load(os.path.join(wav_path), sr=16000) + audio = torch.FloatTensor(speech_array).unsqueeze(0).to(args.device) + level = torch.tensor([1]).to(args.device) + person = torch.tensor([0]).to(args.device) + prediction = model.predict(audio, level, person) + prediction = prediction.squeeze().detach().cpu().numpy() + if args.post_processing: + output = np.zeros((prediction.shape[0], prediction.shape[1])) + for i in range(prediction.shape[1]): + output[:, i] = savgol_filter(prediction[:, i], 5, 2) + output[:, 8] = 0 + output[:, 9] = 0 + i = random.randint(0, 60) + while i < output.shape[0] - 7: + eye_num = random.randint(1, 4) + if eye_num == 1: + output[i:i + 7, 8] = eye1 + output[i:i + 7, 9] = eye1 + elif eye_num == 2: + output[i:i + 7, 8] = eye2 + output[i:i + 7, 9] = eye2 + elif eye_num == 3: + output[i:i + 7, 8] = eye3 + output[i:i + 7, 9] = eye3 + else: + output[i:i + 7, 8] = eye4 + output[i:i + 7, 9] = eye4 + time1 = random.randint(60, 180) + i = i + time1 + return output + else: + return prediction + + +def render_video(wav_name, model_name): + args = Munch( + bs_dim=52, + feature_dim=832, + period=30, + device="cuda", + model_path="./pretrain_model/EmoTalk.pth", + max_seq_len=5000, + num_workers=0, + batch_size=1, + post_processing=True, + blender_path="./blender/blender") + + # wav_name = args.wav_path.split('/')[-1].split('.')[0] + image_path = os.path.join("./audio", wav_name) + os.makedirs(image_path, exist_ok=True) + blender_path = args.blender_path + + python_path = f"./{model_name}.py" + blend_path = f"./{model_name}.blend" + print(python_path, blend_path) + # python_path = "./render.py" + # blend_path = "./render.blend" + cmd = '{} -t 64 -b {} -P {} -- "{}" "{}" '.format(blender_path, + blend_path, + python_path, + 
"./audio/", + wav_name) + cmd = shlex.split(cmd) + p = subprocess.Popen(cmd, + shell=False, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + while p.poll() is None: + line = p.stdout.readline().decode('utf-8') + line = line.strip() + if line and line.startswith('Saved: '): + fname = parse("Saved: '{}'", line).fixed[0] + yield fname + else: + print(line) + + if p.returncode == 0: + print('Subprogram success') + else: + print('Subprogram failed') + + +def construct_video(wav_name): + image_path = os.path.join("./audio", wav_name) + os.makedirs(image_path, exist_ok=True) + image_temp = image_path + "/%d.png" + output_path = os.path.join("./audio", wav_name + ".mp4") + cmd = 'ffmpeg -r 30 -i "{}" -i "{}" -pix_fmt yuv420p -s 512x768 "{}" -y'.format(image_temp, + f"./audio/{wav_name}.wav", + output_path) + subprocess.call(cmd, shell=True) + cmd = 'rm -rf "{}"'.format(image_path) + subprocess.call(cmd, shell=True) + + +class PieInfer(object): + def __init__(self): + args = Munch( + bs_dim=52, + feature_dim=832, + period=30, + device="cuda", + model_path="./pretrain_model/EmoTalk.pth", + max_seq_len=5000, + num_workers=0, + batch_size=1, + post_processing=True, + blender_path="./blender/blender") + #""" + model = EmoTalk(args) + model.load_state_dict(torch.load(args.model_path, map_location=torch.device(args.device)), strict=False) + model = model.to(args.device) + model.eval() + #""" + # model = None + self.model = model + + def __call__(self, + speech_array, + sampling_rate): + return test(self.model, speech_array, sampling_rate) diff --git a/miapia_stream/__init__.py b/miapia_stream/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/server/connect.sh b/server/connect.sh new file mode 100755 index 0000000..02c4a07 --- /dev/null +++ b/server/connect.sh @@ -0,0 +1 @@ +ssh -i ~/.ssh/id_rsa_miapia ubuntu@54.172.214.227 diff --git a/server/sync_code.sh b/server/sync_code.sh new file mode 100755 index 0000000..01ce104 --- /dev/null +++ b/server/sync_code.sh @@ -0,0 +1 @@ +rsync --rsh='ssh -i ~/.ssh/id_rsa_miapia -o IdentitiesOnly=yes' --exclude='.git' --exclude 'node_modules' -Pav ../miapia_own/ ubuntu@54.172.214.227:/home/ubuntu/repo/EmoTalk_release diff --git a/server/sync_code_mia.sh b/server/sync_code_mia.sh new file mode 100755 index 0000000..ad83194 --- /dev/null +++ b/server/sync_code_mia.sh @@ -0,0 +1 @@ +rsync --rsh='ssh -i ~/.ssh/id_rsa_miapia -o IdentitiesOnly=yes' --exclude='.git' --exclude 'node_modules' -Pav ../ ubuntu@54.172.214.227:/home/ubuntu/repo/ diff --git a/t2a_api.py b/t2a_api.py new file mode 100644 index 0000000..bd536ea --- /dev/null +++ b/t2a_api.py @@ -0,0 +1,12 @@ +from transformers import pipeline +import scipy + + +class T2A(object): + def __init__(self): + self.synthesiser = pipeline("text-to-speech", "suno/bark") + + def apply(self, text): + speech = self.synthesiser(text, + forward_params={"do_sample": True}) + scipy.io.wavfile.write("bark_out.wav", rate=speech["sampling_rate"], data=speech["audio"]) diff --git a/test_a2f_api.py b/test_a2f_api.py new file mode 100644 index 0000000..69e0e93 --- /dev/null +++ b/test_a2f_api.py @@ -0,0 +1,38 @@ +import math +import os +import requests +from pprint import pprint +import argparse +import soundfile +from a2f_api import A2F + + +parser = argparse.ArgumentParser() +parser.add_argument("audio_path") +parser.add_argument("--host", type=str, + default="https://a2fdemo.piedata.ai/") +args = parser.parse_args() + + +a2f = A2F(args.host) +print(f"Uploading {args.audio_path}...") +server_audio_path = 
a2f.upload(args.audio_path) +fname = os.path.basename(server_audio_path) +print("Status: ", a2f.status()) +print("EmotionNames: ", a2f.get_emotion_names()) +print("Scene Objects: ", a2f.get_scene_objects()) +print("Scene Players: ", a2f.get_players()) +print("Preprocessing settings: ", a2f.get_pre_settings()) +print("Postprocessing settings: ", a2f.get_post_settings()) +print("Setting player root: ", a2f.set_player_root("/home/ubuntu/results")) +print("Player root: ", a2f.get_player_root()) +print("Setting audio: ", a2f.set_audio(os.path.basename(server_audio_path))) +print("Audio Range: ", a2f.get_audio_range()) +print("Running: ", a2f.run()) +print("NumKeys: ", a2f.get_number_of_keys()) +print("Keys: ", a2f.get_generated_keys()) +# print("BlendShape solvers: ", a2f.get_blendshape_solvers()) +print("Exporting: ", a2f.export_json("/home/ubuntu/results", + filename=os.path.splitext(fname)[0])) +print(f"Pulling to ./{fname}...") +a2f.pull(fname)
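
Beyond this smoke test, the pieces in this patch compose into a text-to-animation chain: `t2a_api.py` turns text into a wav with bark, and the Audio2Face flow above turns a wav into exported animation data. The sketch below is a hedged illustration of wiring the two together; it assumes the `A2F` methods exercised above (`upload`, `set_audio`, `run`, `export_json`, `pull`) are available on the class and that an Audio2Face headless instance is reachable at the demo host this script defaults to.

```python
# Hedged sketch: chain the bark TTS wrapper (t2a_api.py) into the Audio2Face flow
# exercised by test_a2f_api.py. Assumes an A2F headless server is reachable and that
# A2F provides the upload/set_audio/run/export_json/pull methods used above.
import os

from t2a_api import T2A
from a2f_api import A2F

t2a = T2A()
t2a.apply("Hello! What is your name?")       # writes bark_out.wav into the working directory

a2f = A2F("https://a2fdemo.piedata.ai/")     # same default host as test_a2f_api.py
server_path = a2f.upload("bark_out.wav")     # push the audio to the A2F machine
fname = os.path.basename(server_path)

a2f.set_player_root("/home/ubuntu/results")  # player root used by test_a2f_api.py
a2f.set_audio(fname)
a2f.run()                                    # generate the emotion keys for the track
a2f.export_json("/home/ubuntu/results", filename=os.path.splitext(fname)[0])
a2f.pull(fname)                              # fetch the exported result locally
```

As in the script above, the player root and export directory are paths on the Audio2Face host, so they must exist server-side before the export step.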