first commit

George Kasparyants
2024-06-14 00:47:32 +03:00
commit 7591784e34
31 changed files with 3029 additions and 0 deletions

39
.gitignore vendored Executable file

@@ -0,0 +1,39 @@
# Xcode
.DS_Store
#build file
build/
profile
*.moved-aside
DerivedData
.idea/
*.xccheckout
*.xcuserstate
Thumbs.db
*.ipa
*.zip
## User settings
xcuserdata/
### SwiftPackageManager ###
Packages
xcuserdata
#CocoaPods
Pods
*.cer
*.mobileprovision
MiaPia.xcworkspace/xcuserdata/*
MiaPia.xcodeproj/xcuserdata/*
**__pycache__**
*.pyc
*.mp3
*.wav
*.png
*.blend

216
a2f_api.py Normal file

@@ -0,0 +1,216 @@
import math
import os
import requests
from pprint import pprint
import argparse
import soundfile
class A2F(object):
ROOT_PATH = "/home/ubuntu/results"
BASE_PATH = os.path.expanduser("~/.local/share/ov/pkg/audio2face-2023.2.0/")
ASSETS_PATH = os.path.join(BASE_PATH, "exts/omni.audio2face.tool/deps/audio2face-assets")
CLAIRE_PATH = os.path.join(ASSETS_PATH, "claire/mesh/claire_fullface_model.usd")
MARK_PATH = os.path.join(ASSETS_PATH, "mark/mesh/mark_fullface_model.usd")
PLAYER_NAME = "/World/audio2face/Player"
FULLFACE_MODEL_NAME = "/World/audio2face/CoreFullface"
def __init__(self, url="http://localhost:8011/"):
self.url = url
def status(self):
resp = requests.get(f"{self.url}status")
return resp.json()
def get_emotion_names(self):
resp = requests.get(f"{self.url}A2F/A2E/GetEmotionNames")
return resp.json().get('result', [])
def get_scene_objects(self):
resp = requests.get(f"{self.url}A2F/GetInstances")
return resp.json().get('result', {})
def get_players(self):
resp = requests.get(f"{self.url}A2F/Player/GetInstances")
return resp.json().get('result', {})
def load_usd(self, usd_path):
resp = requests.post(f"{self.url}A2F/USD/Load",
json={
"file_name": usd_path,
})
return resp.json()
def load_claire(self):
print("Claire path: ", self.CLAIRE_PATH)
return self.load_usd(self.CLAIRE_PATH)
def load_mark(self):
print("Mark path: ", self.MARK_PATH)
return self.load_usd(self.MARK_PATH)
def openapi(self):
resp = requests.get(f"{self.url}openapi.json")
return resp.json()
def get_frame(self):
pass
def get_settings(self):
resp = requests.post(f"{self.url}A2F/GetSettings", json={
"a2f_instance": "",
})
return resp.json()
def get_player_root(self):
resp = requests.post(f"{self.url}A2F/Player/GetRootPath", json={
"a2f_player": self.PLAYER_NAME,
})
return resp.json()
def set_player_root(self, new_path):
resp = requests.post(f"{self.url}A2F/Player/SetRootPath", json={
"a2f_player": self.PLAYER_NAME,
"dir_path": new_path,
})
return resp.json()
def set_audio(self, audio_path):
duration = soundfile.info(audio_path).duration
print("Audio duration: ", duration)
resp = requests.post(f"{self.url}A2F/Player/SetTrack", json={
"a2f_player": self.PLAYER_NAME,
"file_name": audio_path,
"time_range": [
0,
duration
]
})
data = [resp.json()]
resp = requests.post(f"{self.url}A2F/Player/SetRange", json={
"a2f_player": self.PLAYER_NAME,
"start": 0,
"end": duration
})
data.append(resp.json())
resp = requests.post(f"{self.url}A2F/Player/GetTracks", json={
"a2f_player": self.PLAYER_NAME,
})
data.append(resp.json())
resp = requests.post(f"{self.url}A2F/Player/GetCurrentTrack", json={
"a2f_player": self.PLAYER_NAME,
})
data.append(resp.json())
return data
def get_audio_range(self):
resp = requests.post(f"{self.url}A2F/Player/GetRange", json={
"a2f_player": self.PLAYER_NAME,
})
return resp.json()
def run(self):
resp = requests.post(f"{self.url}A2F/A2E/GenerateKeys", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def get_number_of_keys(self):
resp = requests.post(f"{self.url}A2F/A2E/NumKeys", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def get_generated_keys(self):
resp = requests.post(f"{self.url}A2F/A2E/GetKeyData", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def get_a2e_settings(self):
resp = requests.post(f"{self.url}A2F/A2E/GetSettings", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def get_blendshape_solvers(self):
resp = requests.get(f"{self.url}A2F/Exporter/GetBlendShapeSolvers")
return resp.json()
def get_pre_settings(self):
resp = requests.post(f"{self.url}A2F/PRE/SetSettings", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
"prediction_delay": 0.01,
})
resp = requests.post(f"{self.url}A2F/PRE/GetSettings", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def get_post_settings(self):
resp = requests.post(f"{self.url}A2F/POST/GetSettings", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def export(self, export_path, filename):
resp = requests.post(f"{self.url}A2F/Exporter/ExportGeometryCache", json={
"export_directory": export_path,
"cache_type": "usd",
"xform_keys": False,
"batch": False,
"file_name": filename,
"fps": 0
})
try:
return resp.json()
except ValueError:
print(resp.content)
def export_json(self, export_path, filename):
resp = requests.post(f"{self.url}A2F/Exporter/ExportBlendshapes", json={
"export_directory": export_path,
"format": "json",
"batch": False,
"file_name": filename,
"fps": 0
})
try:
return resp.json()
except ValueError:
print(resp.content)
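# Note: upload() and pull() below delegate file transfer to helper shell scripts under
# ./server (assumed to exist in this repo) that copy files to and from the machine
# running the Audio2Face GUI; ROOT_PATH above is the remote results directory they use.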
def upload(self, audio_path):
audio_path = os.path.abspath(audio_path)
fname = os.path.basename(audio_path)
os.system(f"cd ./server && ./send_file_to_gui.sh {audio_path} ../results/{fname}")
return os.path.join("/home/ubuntu/results", fname)
def pull(self, fname):
export_fname = os.path.splitext(fname)[0] + '.usd'
os.system(f"cd ./server && ./send_file_from_gui.sh ../results/{export_fname} ../")
def apply(self, audio_path):
fname = os.path.basename(audio_path)
print("Status: ", self.status())
print("EmotionNames: ", self.get_emotion_names())
print("Scene Objects: ", self.get_scene_objects())
print("Scene Players: ", self.get_players())
print("Preprocessing settings: ", self.get_pre_settings())
print("Postprocessing settings: ", self.get_post_settings())
print("Setting player root: ", self.set_player_root("/home/ubuntu/results"))
print("Player root: ", self.get_player_root())
print("Setting audio: ", self.set_audio(os.path.basename(audio_path)))
print("Audio Range: ", self.get_audio_range())
print("Running: ", self.run())
print("NumKeys: ", self.get_number_of_keys())
print("Keys: ", self.get_generated_keys())
print("Exporting: ", self.export_json("/home/ubuntu/results",
filename=os.path.splitext(fname)[0]))
def apply_stream(self, audio_path):
pass
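# Usage sketch (assumptions: an Audio2Face headless instance serves its REST API at
# http://localhost:8011/, the ./server helper scripts are available, and the WAV file
# lives in the current directory, since apply() refers to it by basename).
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Export Audio2Face blendshapes for a WAV file")
    parser.add_argument("audio_path", help="WAV file in the current directory")
    parser.add_argument("--url", default="http://localhost:8011/", help="A2F headless REST endpoint")
    args = parser.parse_args()
    a2f = A2F(url=args.url)
    pprint(a2f.status())         # sanity-check the connection
    a2f.upload(args.audio_path)  # copy the WAV into ROOT_PATH next to the A2F instance
    a2f.apply(args.audio_path)   # generate emotion keys and export blendshape JSON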

50
audio2face.py Normal file

@@ -0,0 +1,50 @@
# Speech-to-Audio2Face module using the gRPC helpers from audio2face_streaming_utils
import io
from pydub import AudioSegment
from scipy.io.wavfile import read
import numpy as np
from audio2face_streaming_utils import push_audio_track
class Audio2FaceService:
def __init__(self, sample_rate=44100):
"""
:param sample_rate: sample rate of the audio pushed to Audio2Face
"""
self.a2f_url = 'localhost:50051'  # gRPC endpoint of the local Audio2Face instance
self.sample_rate = sample_rate
self.avatar_instance = '/World/audio2face/PlayerStreaming'  # prim path of your Audio2Face Streaming Audio Player instance
def tts_to_wav(self, tts_byte, framerate=22050) -> np.ndarray:
"""
:param tts_byte: raw 16-bit mono PCM TTS data as bytes
:param framerate: frame rate of the raw TTS data
:return: decoded wav samples
"""
seg = AudioSegment.from_raw(io.BytesIO(tts_byte), sample_width=2, frame_rate=framerate, channels=1)
wavIO = io.BytesIO()
seg.export(wavIO, format="wav")
rate, wav = read(io.BytesIO(wavIO.getvalue()))
return wav
def wav_to_numpy_float32(self, wav_byte) -> np.ndarray:
"""
:param wav_byte: int16 wav samples
:return: samples normalized to float32 in [-1, 1]
"""
return wav_byte.astype(np.float32, order='C') / 32768.0
def get_tts_numpy_audio(self, audio) -> np.ndarray:
"""
:param audio: raw TTS audio bytes (see tts_to_wav)
:return: float32 numpy array of the audio
"""
wav_byte = self.tts_to_wav(audio)
return self.wav_to_numpy_float32(wav_byte)
def make_avatar_speaks(self, audio) -> None:
"""
:param audio: tts audio
:return: None
"""
push_audio_track(self.a2f_url, self.get_tts_numpy_audio(audio), self.sample_rate, self.avatar_instance)
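# Usage sketch (assumptions: Audio2Face is running with a streaming audio player at
# /World/audio2face/PlayerStreaming, and "tts_output.raw" is a hypothetical dump of
# 16-bit mono PCM at 22050 Hz produced by a TTS engine).
if __name__ == "__main__":
    with open("tts_output.raw", "rb") as f:
        raw_pcm = f.read()
    # Pass the true sample rate of the decoded audio so playback speed is correct
    Audio2FaceService(sample_rate=22050).make_avatar_speaks(raw_pcm)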

502
audio2face_pb2.py Normal file

@@ -0,0 +1,502 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: audio2face.proto
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name="audio2face.proto",
package="nvidia.audio2face",
syntax="proto3",
serialized_options=None,
create_key=_descriptor._internal_create_key,
serialized_pb=b'\n\x10\x61udio2face.proto\x12\x11nvidia.audio2face"{\n\x10PushAudioRequest\x12\x15\n\rinstance_name\x18\x01 \x01(\t\x12\x12\n\nsamplerate\x18\x02 \x01(\x05\x12\x12\n\naudio_data\x18\x03 \x01(\x0c\x12(\n block_until_playback_is_finished\x18\x04 \x01(\x08"5\n\x11PushAudioResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"\x85\x01\n\x16PushAudioStreamRequest\x12@\n\x0cstart_marker\x18\x01 \x01(\x0b\x32(.nvidia.audio2face.PushAudioRequestStartH\x00\x12\x14\n\naudio_data\x18\x02 \x01(\x0cH\x00\x42\x13\n\x11streaming_request"l\n\x15PushAudioRequestStart\x12\x15\n\rinstance_name\x18\x01 \x01(\t\x12\x12\n\nsamplerate\x18\x02 \x01(\x05\x12(\n block_until_playback_is_finished\x18\x03 \x01(\x08";\n\x17PushAudioStreamResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t2\xd4\x01\n\nAudio2Face\x12X\n\tPushAudio\x12#.nvidia.audio2face.PushAudioRequest\x1a$.nvidia.audio2face.PushAudioResponse"\x00\x12l\n\x0fPushAudioStream\x12).nvidia.audio2face.PushAudioStreamRequest\x1a*.nvidia.audio2face.PushAudioStreamResponse"\x00(\x01\x62\x06proto3',
)
_PUSHAUDIOREQUEST = _descriptor.Descriptor(
name="PushAudioRequest",
full_name="nvidia.audio2face.PushAudioRequest",
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="instance_name",
full_name="nvidia.audio2face.PushAudioRequest.instance_name",
index=0,
number=1,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="samplerate",
full_name="nvidia.audio2face.PushAudioRequest.samplerate",
index=1,
number=2,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="audio_data",
full_name="nvidia.audio2face.PushAudioRequest.audio_data",
index=2,
number=3,
type=12,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"",
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="block_until_playback_is_finished",
full_name="nvidia.audio2face.PushAudioRequest.block_until_playback_is_finished",
index=3,
number=4,
type=8,
cpp_type=7,
label=1,
has_default_value=False,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax="proto3",
extension_ranges=[],
oneofs=[],
serialized_start=39,
serialized_end=162,
)
_PUSHAUDIORESPONSE = _descriptor.Descriptor(
name="PushAudioResponse",
full_name="nvidia.audio2face.PushAudioResponse",
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="success",
full_name="nvidia.audio2face.PushAudioResponse.success",
index=0,
number=1,
type=8,
cpp_type=7,
label=1,
has_default_value=False,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="message",
full_name="nvidia.audio2face.PushAudioResponse.message",
index=1,
number=2,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax="proto3",
extension_ranges=[],
oneofs=[],
serialized_start=164,
serialized_end=217,
)
_PUSHAUDIOSTREAMREQUEST = _descriptor.Descriptor(
name="PushAudioStreamRequest",
full_name="nvidia.audio2face.PushAudioStreamRequest",
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="start_marker",
full_name="nvidia.audio2face.PushAudioStreamRequest.start_marker",
index=0,
number=1,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="audio_data",
full_name="nvidia.audio2face.PushAudioStreamRequest.audio_data",
index=1,
number=2,
type=12,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"",
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax="proto3",
extension_ranges=[],
oneofs=[
_descriptor.OneofDescriptor(
name="streaming_request",
full_name="nvidia.audio2face.PushAudioStreamRequest.streaming_request",
index=0,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[],
)
],
serialized_start=220,
serialized_end=353,
)
_PUSHAUDIOREQUESTSTART = _descriptor.Descriptor(
name="PushAudioRequestStart",
full_name="nvidia.audio2face.PushAudioRequestStart",
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="instance_name",
full_name="nvidia.audio2face.PushAudioRequestStart.instance_name",
index=0,
number=1,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="samplerate",
full_name="nvidia.audio2face.PushAudioRequestStart.samplerate",
index=1,
number=2,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="block_until_playback_is_finished",
full_name="nvidia.audio2face.PushAudioRequestStart.block_until_playback_is_finished",
index=2,
number=3,
type=8,
cpp_type=7,
label=1,
has_default_value=False,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax="proto3",
extension_ranges=[],
oneofs=[],
serialized_start=355,
serialized_end=463,
)
_PUSHAUDIOSTREAMRESPONSE = _descriptor.Descriptor(
name="PushAudioStreamResponse",
full_name="nvidia.audio2face.PushAudioStreamResponse",
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="success",
full_name="nvidia.audio2face.PushAudioStreamResponse.success",
index=0,
number=1,
type=8,
cpp_type=7,
label=1,
has_default_value=False,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="message",
full_name="nvidia.audio2face.PushAudioStreamResponse.message",
index=1,
number=2,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax="proto3",
extension_ranges=[],
oneofs=[],
serialized_start=465,
serialized_end=524,
)
_PUSHAUDIOSTREAMREQUEST.fields_by_name["start_marker"].message_type = _PUSHAUDIOREQUESTSTART
_PUSHAUDIOSTREAMREQUEST.oneofs_by_name["streaming_request"].fields.append(
_PUSHAUDIOSTREAMREQUEST.fields_by_name["start_marker"]
)
_PUSHAUDIOSTREAMREQUEST.fields_by_name["start_marker"].containing_oneof = _PUSHAUDIOSTREAMREQUEST.oneofs_by_name[
"streaming_request"
]
_PUSHAUDIOSTREAMREQUEST.oneofs_by_name["streaming_request"].fields.append(
_PUSHAUDIOSTREAMREQUEST.fields_by_name["audio_data"]
)
_PUSHAUDIOSTREAMREQUEST.fields_by_name["audio_data"].containing_oneof = _PUSHAUDIOSTREAMREQUEST.oneofs_by_name[
"streaming_request"
]
DESCRIPTOR.message_types_by_name["PushAudioRequest"] = _PUSHAUDIOREQUEST
DESCRIPTOR.message_types_by_name["PushAudioResponse"] = _PUSHAUDIORESPONSE
DESCRIPTOR.message_types_by_name["PushAudioStreamRequest"] = _PUSHAUDIOSTREAMREQUEST
DESCRIPTOR.message_types_by_name["PushAudioRequestStart"] = _PUSHAUDIOREQUESTSTART
DESCRIPTOR.message_types_by_name["PushAudioStreamResponse"] = _PUSHAUDIOSTREAMRESPONSE
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
PushAudioRequest = _reflection.GeneratedProtocolMessageType(
"PushAudioRequest",
(_message.Message,),
{
"DESCRIPTOR": _PUSHAUDIOREQUEST,
"__module__": "audio2face_pb2"
# @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioRequest)
},
)
_sym_db.RegisterMessage(PushAudioRequest)
PushAudioResponse = _reflection.GeneratedProtocolMessageType(
"PushAudioResponse",
(_message.Message,),
{
"DESCRIPTOR": _PUSHAUDIORESPONSE,
"__module__": "audio2face_pb2"
# @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioResponse)
},
)
_sym_db.RegisterMessage(PushAudioResponse)
PushAudioStreamRequest = _reflection.GeneratedProtocolMessageType(
"PushAudioStreamRequest",
(_message.Message,),
{
"DESCRIPTOR": _PUSHAUDIOSTREAMREQUEST,
"__module__": "audio2face_pb2"
# @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioStreamRequest)
},
)
_sym_db.RegisterMessage(PushAudioStreamRequest)
PushAudioRequestStart = _reflection.GeneratedProtocolMessageType(
"PushAudioRequestStart",
(_message.Message,),
{
"DESCRIPTOR": _PUSHAUDIOREQUESTSTART,
"__module__": "audio2face_pb2"
# @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioRequestStart)
},
)
_sym_db.RegisterMessage(PushAudioRequestStart)
PushAudioStreamResponse = _reflection.GeneratedProtocolMessageType(
"PushAudioStreamResponse",
(_message.Message,),
{
"DESCRIPTOR": _PUSHAUDIOSTREAMRESPONSE,
"__module__": "audio2face_pb2"
# @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioStreamResponse)
},
)
_sym_db.RegisterMessage(PushAudioStreamResponse)
_AUDIO2FACE = _descriptor.ServiceDescriptor(
name="Audio2Face",
full_name="nvidia.audio2face.Audio2Face",
file=DESCRIPTOR,
index=0,
serialized_options=None,
create_key=_descriptor._internal_create_key,
serialized_start=527,
serialized_end=739,
methods=[
_descriptor.MethodDescriptor(
name="PushAudio",
full_name="nvidia.audio2face.Audio2Face.PushAudio",
index=0,
containing_service=None,
input_type=_PUSHAUDIOREQUEST,
output_type=_PUSHAUDIORESPONSE,
serialized_options=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.MethodDescriptor(
name="PushAudioStream",
full_name="nvidia.audio2face.Audio2Face.PushAudioStream",
index=1,
containing_service=None,
input_type=_PUSHAUDIOSTREAMREQUEST,
output_type=_PUSHAUDIOSTREAMRESPONSE,
serialized_options=None,
create_key=_descriptor._internal_create_key,
),
],
)
_sym_db.RegisterServiceDescriptor(_AUDIO2FACE)
DESCRIPTOR.services_by_name["Audio2Face"] = _AUDIO2FACE
# @@protoc_insertion_point(module_scope)

122
audio2face_pb2_grpc.py Normal file

@@ -0,0 +1,122 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import audio2face_pb2 as audio2face__pb2
class Audio2FaceStub(object):
"""Missing associated documentation comment in .proto file."""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.PushAudio = channel.unary_unary(
"/nvidia.audio2face.Audio2Face/PushAudio",
request_serializer=audio2face__pb2.PushAudioRequest.SerializeToString,
response_deserializer=audio2face__pb2.PushAudioResponse.FromString,
)
self.PushAudioStream = channel.stream_unary(
"/nvidia.audio2face.Audio2Face/PushAudioStream",
request_serializer=audio2face__pb2.PushAudioStreamRequest.SerializeToString,
response_deserializer=audio2face__pb2.PushAudioStreamResponse.FromString,
)
class Audio2FaceServicer(object):
"""Missing associated documentation comment in .proto file."""
def PushAudio(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details("Method not implemented!")
raise NotImplementedError("Method not implemented!")
def PushAudioStream(self, request_iterator, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details("Method not implemented!")
raise NotImplementedError("Method not implemented!")
def add_Audio2FaceServicer_to_server(servicer, server):
rpc_method_handlers = {
"PushAudio": grpc.unary_unary_rpc_method_handler(
servicer.PushAudio,
request_deserializer=audio2face__pb2.PushAudioRequest.FromString,
response_serializer=audio2face__pb2.PushAudioResponse.SerializeToString,
),
"PushAudioStream": grpc.stream_unary_rpc_method_handler(
servicer.PushAudioStream,
request_deserializer=audio2face__pb2.PushAudioStreamRequest.FromString,
response_serializer=audio2face__pb2.PushAudioStreamResponse.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler("nvidia.audio2face.Audio2Face", rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
# This class is part of an EXPERIMENTAL API.
class Audio2Face(object):
"""Missing associated documentation comment in .proto file."""
@staticmethod
def PushAudio(
request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None,
):
return grpc.experimental.unary_unary(
request,
target,
"/nvidia.audio2face.Audio2Face/PushAudio",
audio2face__pb2.PushAudioRequest.SerializeToString,
audio2face__pb2.PushAudioResponse.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
)
@staticmethod
def PushAudioStream(
request_iterator,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None,
):
return grpc.experimental.stream_unary(
request_iterator,
target,
"/nvidia.audio2face.Audio2Face/PushAudioStream",
audio2face__pb2.PushAudioStreamRequest.SerializeToString,
audio2face__pb2.PushAudioStreamResponse.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
)

142
audio2face_streaming_utils.py Normal file

@@ -0,0 +1,142 @@
"""
This demo script shows how to send audio data to Audio2Face Streaming Audio Player via gRPC requests.
There are two options:
* Send the whole track at once using PushAudioRequest()
* Send the audio chunks sequentially in a stream using PushAudioStreamRequest()
For the second option this script emulates the stream of chunks, generated by splitting an input WAV audio file.
But in a real application such a stream of chunks may be acquired from some other streaming source:
* streaming audio via internet, streaming Text-To-Speech, etc
gRPC protocol details can be found in audio2face.proto
"""
import sys
import grpc
import time
import numpy as np
import soundfile
import audio2face_pb2
import audio2face_pb2_grpc
def push_audio_track(url, audio_data, samplerate, instance_name):
"""
This function pushes the whole audio track at once via PushAudioRequest()
PushAudioRequest parameters:
* audio_data: bytes, containing audio data for the whole track, where each sample is encoded as 4 bytes (float32)
* samplerate: sampling rate for the audio data
* instance_name: prim path of the Audio2Face Streaming Audio Player on the stage, where to push the audio data
* block_until_playback_is_finished: if True, the gRPC request will be blocked until the playback of the pushed track is finished
The request is passed to PushAudio()
"""
block_until_playback_is_finished = True # ADJUST
with grpc.insecure_channel(url) as channel:
stub = audio2face_pb2_grpc.Audio2FaceStub(channel)
request = audio2face_pb2.PushAudioRequest()
request.audio_data = audio_data.astype(np.float32).tobytes()
request.samplerate = samplerate
request.instance_name = instance_name
request.block_until_playback_is_finished = block_until_playback_is_finished
print("Sending audio data...")
response = stub.PushAudio(request)
if response.success:
print("SUCCESS")
else:
print(f"ERROR: {response.message}")
print("Closed channel")
def push_audio_track_stream(url, audio_data, samplerate, instance_name):
"""
This function pushes audio chunks sequentially via PushAudioStreamRequest()
The function emulates the stream of chunks, generated by splitting the input audio track.
But in a real application such a stream of chunks may be acquired from some other streaming source.
The first message must contain the start_marker field, carrying only meta information (without audio data):
* samplerate: sampling rate for the audio data
* instance_name: prim path of the Audio2Face Streaming Audio Player on the stage, where to push the audio data
* block_until_playback_is_finished: if True, the gRPC request will be blocked until the playback of the pushed track is finished (after the last message)
The second and subsequent messages must contain the audio_data field:
* audio_data: bytes, containing audio data for an audio chunk, where each sample is encoded as 4 bytes (float32)
All messages are packed into a Python generator and passed to PushAudioStream()
"""
chunk_size = samplerate // 10 # ADJUST
sleep_between_chunks = 0.04 # ADJUST
block_until_playback_is_finished = True # ADJUST
with grpc.insecure_channel(url) as channel:
print("Channel creadted")
stub = audio2face_pb2_grpc.Audio2FaceStub(channel)
def make_generator():
start_marker = audio2face_pb2.PushAudioRequestStart(
samplerate=samplerate,
instance_name=instance_name,
block_until_playback_is_finished=block_until_playback_is_finished,
)
# At first, we send a message with start_marker
yield audio2face_pb2.PushAudioStreamRequest(start_marker=start_marker)
# Then we send messages with audio_data
for i in range(len(audio_data) // chunk_size + 1):
time.sleep(sleep_between_chunks)
chunk = audio_data[i * chunk_size : i * chunk_size + chunk_size]
yield audio2face_pb2.PushAudioStreamRequest(audio_data=chunk.astype(np.float32).tobytes())
request_generator = make_generator()
print("Sending audio data...")
response = stub.PushAudioStream(request_generator)
if response.success:
print("SUCCESS")
else:
print(f"ERROR: {response.message}")
print("Channel closed")
def main():
"""
This demo script shows how to send audio data to Audio2Face Streaming Audio Player via gRPC requests.
There are two options:
* Send the whole track at once using PushAudioRequest()
* Send the audio chunks sequentially in a stream using PushAudioStreamRequest()
For the second option this script emulates the stream of chunks, generated by splitting an input WAV audio file.
But in a real application such a stream of chunks may be acquired from some other streaming source:
* streaming audio via internet, streaming Text-To-Speech, etc
gRPC protocol details can be found in audio2face.proto
"""
if len(sys.argv) < 3:
print("Format: python test_client.py PATH_TO_WAV INSTANCE_NAME")
return
# Sleep time emulates long latency of the request
sleep_time = 2.0 # ADJUST
# URL of the Audio2Face Streaming Audio Player server (where A2F App is running)
url = "localhost:50051" # ADJUST
# Local input WAV file path
audio_fpath = sys.argv[1]
# Prim path of the Audio2Face Streaming Audio Player on the stage (where to push the audio data)
instance_name = sys.argv[2]
data, samplerate = soundfile.read(audio_fpath, dtype="float32")
# Only Mono audio is supported
if len(data.shape) > 1:
data = np.average(data, axis=1)
print(f"Sleeping for {sleep_time} seconds")
time.sleep(sleep_time)
if 0: # ADJUST
# Push the whole audio track at once
push_audio_track(url, data, samplerate, instance_name)
else:
# Emulate audio stream and push audio chunks sequentially
push_audio_track_stream(url, data, samplerate, instance_name)
if __name__ == "__main__":
main()

54
emotalk_own/Dockerfile Normal file

@@ -0,0 +1,54 @@
FROM nvidia/cudagl:11.3.1-devel-ubuntu20.04
MAINTAINER "Jungwoo Choi"
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Seoul
ADD requirements.txt /tmp/requirements.txt
RUN \
# Fix CUDA apt error
rm -f /etc/apt/sources.list.d/cuda.list && \
rm -f /etc/apt/sources.list.d/nvidia-ml.list && \
apt-get update && apt-get install -y gnupg2 software-properties-common && \
apt-key del 7fa2af80 && \
apt-get update && apt-get install -y --no-install-recommends wget && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
# Install Start
apt update && \
add-apt-repository -y ppa:savoury1/ffmpeg4 && \
apt -y install python3.8 python3.8-distutils libgl1-mesa-glx libglib2.0-0 git wget zsh vim openssh-server curl ffmpeg && \
# Python Library
update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \
wget https://bootstrap.pypa.io/get-pip.py && \
python3 get-pip.py && \
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113 && \
pip install -r /tmp/requirements.txt && \
# zsh option
chsh -s /bin/zsh && \
sh -c "$(wget -O- https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" && \
# add zsh-autosuggestions, zsh-syntax-highlighting plugin
git clone https://github.com/zsh-users/zsh-autosuggestions ~/.oh-my-zsh/custom/plugins/zsh-autosuggestions && \
git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ~/.oh-my-zsh/custom/plugins/zsh-syntax-highlighting && \
# Modify .zshrc with Perl
perl -pi -w -e 's/ZSH_THEME=.*/ZSH_THEME="af-magic"/g;' ~/.zshrc && \
perl -pi -w -e 's/plugins=.*/plugins=(git ssh-agent zsh-autosuggestions zsh-syntax-highlighting)/g;' ~/.zshrc && \
# Set the ssh login and password; the default is id = root, password = root.
# I recommend changing this for better security
# PermitRootLogin : yes - for ssh connection
echo 'root:root' |chpasswd && \
sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && \
sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config && \
mkdir /root/.ssh && \
mkdir /var/run/sshd && \
# install the English language pack and update the locale.
apt-get install -y language-pack-en && update-locale && \
# Clean up
apt-get clean && \
apt-get autoclean && \
apt-get autoremove -y && \
rm -rf /var/lib/cache/* && \
rm -rf /var/lib/log/*
WORKDIR /workspace
CMD ["echo", "nvidia/cudagl:11.3.1-devel-ubuntu20.04 is ready!", 'zsh']

13
emotalk_own/LICENSE Normal file

@@ -0,0 +1,13 @@
Copyright (c) 2023 Psyche AI Inc.
This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License (CC BY-NC 4.0). To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, and distribute the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
1. Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
2. NonCommercial — You may not use the material for commercial purposes.
3. No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

4
emotalk_own/blender.sh Executable file

@@ -0,0 +1,4 @@
#!/bin/bash
set -e
wget https://ftp.nluug.nl/pub/graphics/blender/release/Blender3.4/blender-3.4.1-linux-x64.tar.xz
tar -xf blender-3.4.1-linux-x64.tar.xz
mv blender-3.4.1-linux-x64 blender && rm blender-3.4.1-linux-x64.tar.xz

111
emotalk_own/demo.py Normal file
View File

@@ -0,0 +1,111 @@
import librosa
import numpy as np
import argparse
from scipy.signal import savgol_filter
import torch
from model import EmoTalk
import random
import os, subprocess
import shlex
@torch.no_grad()
def test(args):
result_path = args.result_path
os.makedirs(result_path, exist_ok=True)
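# Four hand-tuned 7-frame blink curves; during post-processing one of them is written
# into blendshape channels 8 and 9 (eyeBlinkLeft / eyeBlinkRight) at random intervals.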
eye1 = np.array([0.36537236, 0.950235724, 0.95593375, 0.916715622, 0.367256105, 0.119113259, 0.025357503])
eye2 = np.array([0.234776169, 0.909951985, 0.944758058, 0.777862132, 0.191071674, 0.235437036, 0.089163929])
eye3 = np.array([0.870040774, 0.949833691, 0.949418545, 0.695911646, 0.191071674, 0.072576277, 0.007108896])
eye4 = np.array([0.000307991, 0.556701422, 0.952656746, 0.942345619, 0.425857186, 0.148335218, 0.017659493])
model = EmoTalk(args)
model.load_state_dict(torch.load(args.model_path, map_location=torch.device(args.device)), strict=False)
model = model.to(args.device)
model.eval()
wav_path = args.wav_path
file_name = wav_path.split('/')[-1].split('.')[0]
speech_array, sampling_rate = librosa.load(os.path.join(wav_path), sr=16000)
audio = torch.FloatTensor(speech_array).unsqueeze(0).to(args.device)
level = torch.tensor([1]).to(args.device)
person = torch.tensor([0]).to(args.device)
prediction = model.predict(audio, level, person)
prediction = prediction.squeeze().detach().cpu().numpy()
if args.post_processing:
output = np.zeros((prediction.shape[0], prediction.shape[1]))
for i in range(prediction.shape[1]):
output[:, i] = savgol_filter(prediction[:, i], 5, 2)
output[:, 8] = 0
output[:, 9] = 0
i = random.randint(0, 60)
while i < output.shape[0] - 7:
eye_num = random.randint(1, 4)
if eye_num == 1:
output[i:i + 7, 8] = eye1
output[i:i + 7, 9] = eye1
elif eye_num == 2:
output[i:i + 7, 8] = eye2
output[i:i + 7, 9] = eye2
elif eye_num == 3:
output[i:i + 7, 8] = eye3
output[i:i + 7, 9] = eye3
else:
output[i:i + 7, 8] = eye4
output[i:i + 7, 9] = eye4
time1 = random.randint(60, 180)
i = i + time1
np.save(os.path.join(result_path, "{}.npy".format(file_name)), output) # with postprocessing (smoothing and blinking)
else:
np.save(os.path.join(result_path, "{}.npy".format(file_name)), prediction) # without post-processing
def render_video(args):
wav_name = args.wav_path.split('/')[-1].split('.')[0]
image_path = os.path.join(args.result_path, wav_name)
os.makedirs(image_path, exist_ok=True)
image_temp = image_path + "/%d.png"
output_path = os.path.join(args.result_path, wav_name + ".mp4")
blender_path = args.blender_path
python_path = "./render.py"
blend_path = "./render.blend"
cmd = '{} -t 64 -b {} -P {} -- "{}" "{}" '.format(blender_path, blend_path, python_path, args.result_path, wav_name)
cmd = shlex.split(cmd)
p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while p.poll() is None:
line = p.stdout.readline()
line = line.strip()
if line:
print('[{}]'.format(line))
if p.returncode == 0:
print('Subprogram success')
else:
print('Subprogram failed')
cmd = 'ffmpeg -r 30 -i "{}" -i "{}" -pix_fmt yuv420p -s 512x768 "{}" -y'.format(image_temp, args.wav_path, output_path)
subprocess.call(cmd, shell=True)
cmd = 'rm -rf "{}"'.format(image_path)
subprocess.call(cmd, shell=True)
def main():
parser = argparse.ArgumentParser(
description='EmoTalk: Speech-driven Emotional Disentanglement for 3D Face Animation')
parser.add_argument("--wav_path", type=str, default="./audio/angry1.wav", help='path of the test data')
parser.add_argument("--bs_dim", type=int, default=52, help='number of blendshapes:52')
parser.add_argument("--feature_dim", type=int, default=832, help='number of feature dim')
parser.add_argument("--period", type=int, default=30, help='number of period')
parser.add_argument("--device", type=str, default="cuda", help='device')
parser.add_argument("--model_path", type=str, default="./pretrain_model/EmoTalk.pth",
help='path of the trained models')
parser.add_argument("--result_path", type=str, default="./result/", help='path of the result')
parser.add_argument("--max_seq_len", type=int, default=5000, help='max sequence length')
parser.add_argument("--num_workers", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--post_processing", type=bool, default=True, help='whether to use post processing')
parser.add_argument("--blender_path", type=str, default="./blender/blender", help='path of blender')
args = parser.parse_args()
test(args)
render_video(args)
if __name__ == "__main__":
main()

144
emotalk_own/model.py Normal file

@@ -0,0 +1,144 @@
import torch
import torch.nn as nn
import numpy as np
import math
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor
from wav2vec import Wav2Vec2Model, Wav2Vec2ForSpeechClassification
from utils import init_biased_mask, enc_dec_mask
class EmoTalk(nn.Module):
def __init__(self, args):
super(EmoTalk, self).__init__()
self.feature_dim = args.feature_dim
self.bs_dim = args.bs_dim
self.device = args.device
self.batch_size = args.batch_size
self.audio_encoder_cont = Wav2Vec2Model.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
self.processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
self.audio_encoder_cont.feature_extractor._freeze_parameters()
self.audio_encoder_emo = Wav2Vec2ForSpeechClassification.from_pretrained(
"r-f/wav2vec-english-speech-emotion-recognition")
self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
"r-f/wav2vec-english-speech-emotion-recognition")
self.audio_encoder_emo.wav2vec2.feature_extractor._freeze_parameters()
self.max_seq_len = args.max_seq_len
self.audio_feature_map_cont = nn.Linear(1024, 512)
self.audio_feature_map_emo = nn.Linear(1024, 832)
self.audio_feature_map_emo2 = nn.Linear(832, 256)
self.relu = nn.ReLU()
self.biased_mask1 = init_biased_mask(n_head=4, max_seq_len=args.max_seq_len, period=args.period)
self.one_hot_level = np.eye(2)
self.obj_vector_level = nn.Linear(2, 32)
self.one_hot_person = np.eye(24)
self.obj_vector_person = nn.Linear(24, 32)
decoder_layer = nn.TransformerDecoderLayer(d_model=args.feature_dim, nhead=4, dim_feedforward=args.feature_dim,
batch_first=True)
self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=1)
self.bs_map_r = nn.Linear(self.feature_dim, self.bs_dim)
nn.init.constant_(self.bs_map_r.weight, 0)
nn.init.constant_(self.bs_map_r.bias, 0)
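# forward() is the training path: it consumes a pair of audio inputs (data["input12"],
# data["input21"]) and returns two reconstructed blendshape sequences plus the emotion
# logits predicted by the emotion encoder, in line with EmoTalk's cross-reconstruction
# training; predict() below is the single-audio inference path used by demo.py.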
def forward(self, data):
frame_num11 = data["target11"].shape[1]
frame_num12 = data["target12"].shape[1]
inputs12 = self.processor(torch.squeeze(data["input12"]), sampling_rate=16000, return_tensors="pt",
padding="longest").input_values.to(self.device)
hidden_states_cont1 = self.audio_encoder_cont(inputs12, frame_num=frame_num11).last_hidden_state
hidden_states_cont12 = self.audio_encoder_cont(inputs12, frame_num=frame_num12).last_hidden_state
inputs21 = self.feature_extractor(torch.squeeze(data["input21"]), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
inputs12 = self.feature_extractor(torch.squeeze(data["input12"]), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
output_emo1 = self.audio_encoder_emo(inputs21, frame_num=frame_num11)
output_emo2 = self.audio_encoder_emo(inputs12, frame_num=frame_num12)
hidden_states_emo1 = output_emo1.hidden_states
hidden_states_emo2 = output_emo2.hidden_states
label1 = output_emo1.logits
onehot_level = self.one_hot_level[data["level"]]
onehot_level = torch.from_numpy(onehot_level).to(self.device).float()
onehot_person = self.one_hot_person[data["person"]]
onehot_person = torch.from_numpy(onehot_person).to(self.device).float()
if data["target11"].shape[0] == 1:
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0)
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0)
else:
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0).permute(1, 0, 2)
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0).permute(1, 0, 2)
obj_embedding_level11 = obj_embedding_level.repeat(1, frame_num11, 1)
obj_embedding_level12 = obj_embedding_level.repeat(1, frame_num12, 1)
obj_embedding_person11 = obj_embedding_person.repeat(1, frame_num11, 1)
obj_embedding_person12 = obj_embedding_person.repeat(1, frame_num12, 1)
hidden_states_cont1 = self.audio_feature_map_cont(hidden_states_cont1)
hidden_states_emo11_832 = self.audio_feature_map_emo(hidden_states_emo1)
hidden_states_emo11_256 = self.relu(self.audio_feature_map_emo2(hidden_states_emo11_832))
hidden_states11 = torch.cat(
[hidden_states_cont1, hidden_states_emo11_256, obj_embedding_level11, obj_embedding_person11], dim=2)
hidden_states_cont12 = self.audio_feature_map_cont(hidden_states_cont12)
hidden_states_emo12_832 = self.audio_feature_map_emo(hidden_states_emo2)
hidden_states_emo12_256 = self.relu(self.audio_feature_map_emo2(hidden_states_emo12_832))
hidden_states12 = torch.cat(
[hidden_states_cont12, hidden_states_emo12_256, obj_embedding_level12, obj_embedding_person12], dim=2)
if data["target11"].shape[0] == 1:
tgt_mask11 = self.biased_mask1[:, :hidden_states11.shape[1], :hidden_states11.shape[1]].clone().detach().to(
device=self.device)
tgt_mask22 = self.biased_mask1[:, :hidden_states12.shape[1], :hidden_states12.shape[1]].clone().detach().to(
device=self.device)
memory_mask11 = enc_dec_mask(self.device, hidden_states11.shape[1], hidden_states11.shape[1])
memory_mask12 = enc_dec_mask(self.device, hidden_states12.shape[1], hidden_states12.shape[1])
bs_out11 = self.transformer_decoder(hidden_states11, hidden_states_emo11_832, tgt_mask=tgt_mask11,
memory_mask=memory_mask11)
bs_out12 = self.transformer_decoder(hidden_states12, hidden_states_emo12_832, tgt_mask=tgt_mask22,
memory_mask=memory_mask12)
bs_output11 = self.bs_map_r(bs_out11)
bs_output12 = self.bs_map_r(bs_out12)
return bs_output11, bs_output12, label1
def predict(self, audio, level, person):
frame_num11 = math.ceil(audio.shape[1] / 16000 * 30)
inputs12 = self.processor(torch.squeeze(audio), sampling_rate=16000, return_tensors="pt",
padding="longest").input_values.to(self.device)
hidden_states_cont1 = self.audio_encoder_cont(inputs12, frame_num=frame_num11).last_hidden_state
inputs12 = self.feature_extractor(torch.squeeze(audio), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
output_emo1 = self.audio_encoder_emo(inputs12, frame_num=frame_num11)
hidden_states_emo1 = output_emo1.hidden_states
onehot_level = self.one_hot_level[level]
onehot_level = torch.from_numpy(onehot_level).to(self.device).float()
onehot_person = self.one_hot_person[person]
onehot_person = torch.from_numpy(onehot_person).to(self.device).float()
if audio.shape[0] == 1:
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0)
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0)
else:
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0).permute(1, 0, 2)
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0).permute(1, 0, 2)
obj_embedding_level11 = obj_embedding_level.repeat(1, frame_num11, 1)
obj_embedding_person11 = obj_embedding_person.repeat(1, frame_num11, 1)
hidden_states_cont1 = self.audio_feature_map_cont(hidden_states_cont1)
hidden_states_emo11_832 = self.audio_feature_map_emo(hidden_states_emo1)
hidden_states_emo11_256 = self.relu(
self.audio_feature_map_emo2(hidden_states_emo11_832))
hidden_states11 = torch.cat(
[hidden_states_cont1, hidden_states_emo11_256, obj_embedding_level11, obj_embedding_person11], dim=2)
if audio.shape[0] == 1:
tgt_mask11 = self.biased_mask1[:, :hidden_states11.shape[1],
:hidden_states11.shape[1]].clone().detach().to(device=self.device)
memory_mask11 = enc_dec_mask(self.device, hidden_states11.shape[1], hidden_states11.shape[1])
bs_out11 = self.transformer_decoder(hidden_states11, hidden_states_emo11_832, tgt_mask=tgt_mask11,
memory_mask=memory_mask11)
bs_output11 = self.bs_map_r(bs_out11)
return bs_output11

103
emotalk_own/readme.md Normal file

@@ -0,0 +1,103 @@
![Psyche AI Inc release](./media/psy_logo.png)
# EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation [ICCV2023]
Official PyTorch implementation for the paper:
> **EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation**, ***ICCV 2023***.
>
> Ziqiao Peng, Haoyu Wu, Zhenbo Song, Hao Xu, Xiangyu Zhu, Jun He, Hongyan Liu, Zhaoxin Fan
>
> [Arxiv](https://arxiv.org/abs/2303.11089) | [Project Page](https://ziqiaopeng.github.io/emotalk/) | [License](https://github.com/psyai-net/EmoTalk_release/blob/main/LICENSE)
<p align="center">
<img src="./media/emotalk.png" width="90%" />
</p>
> Given audio input expressing different emotions, EmoTalk produces realistic 3D facial animation sequences with corresponding emotional expressions as outputs.
## News
- `2023.10.17` Thanks to [noirmist](https://github.com/noirmist)! Now you can create the environment via docker.
## Environment
- Linux
- Python 3.8.8
- PyTorch 1.12.1
- CUDA 11.3
- Blender 3.4.1
- ffmpeg 4.4.1
Clone the repo:
```bash
git clone https://github.com/psyai-net/EmoTalk_release.git
cd EmoTalk_release
```
Create conda environment:
```bash
conda create -n emotalk python=3.8.8
conda activate emotalk
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
pip install -r requirements.txt
```
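Alternatively, you can build the environment with the provided Dockerfile (see the News note above); the image tag and mount path below are placeholders:
```bash
docker build -t emotalk .
docker run --gpus all -it -v $(pwd):/workspace emotalk zsh
```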
## **Demo**
Download Blender and put it in this directory.
```bash
wget https://ftp.nluug.nl/pub/graphics/blender/release/Blender3.4/blender-3.4.1-linux-x64.tar.xz
tar -xf blender-3.4.1-linux-x64.tar.xz
mv blender-3.4.1-linux-x64 blender && rm blender-3.4.1-linux-x64.tar.xz
```
Download the pretrained models from [EmoTalk.pth](https://drive.google.com/file/d/1KQZ-WGI9VDFLqgNXvJQosKVCbjTaCPqK/view?usp=drive_link) (Updated). Put the pretrained models under the `pretrain_model` folder.
Put the audio under the `audio` folder and run
```bash
python demo.py --wav_path "./audio/disgust.wav"
```
The generated animation will be saved in the `result` folder.
## **Dataset**
If you would like to download the 3D-ETF dataset, please fill in the [agreement](https://drive.google.com/file/d/1AQ5_focSgw9WiJdA2R44BQOrdTUe2ABd/view?usp=drive_link), and use your educational email address to email Ziqiao Peng (pengziqiao@ruc.edu.cn) and cc Zhaoxin Fan (fanzhaoxin@psyai.net) to request the download link.
## **Citation**
If you find this work useful for your research, please cite our paper:
```
@InProceedings{Peng_2023_ICCV,
author = {Peng, Ziqiao and Wu, Haoyu and Song, Zhenbo and Xu, Hao and Zhu, Xiangyu and He, Jun and Liu, Hongyan and Fan, Zhaoxin},
title = {EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
month = {October},
year = {2023},
pages = {20687-20697}
}
```
## **Acknowledgement**
Here are some great resources we benefit from:
- [Faceformer](https://github.com/EvelynFan/FaceFormer) for training pipeline
- [EVP](https://github.com/jixinya/EVP) for training dataloader
- [Speech-driven-expressions](https://github.com/YoungSeng/Speech-driven-expressions) for rendering
- [Wav2Vec2 Content](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english) and [Wav2Vec2 Emotion](https://huggingface.co/r-f/wav2vec-english-speech-emotion-recognition) for audio encoder
- [Head Template](http://filmicworlds.com/blog/solving-face-scans-for-arkit/) for visualization.
Thanks to John Hable for sharing his head template under the CC0 license, which is very helpful for us to visualize the results.
## **Contact**
For research purposes, such as comparison of experimental results, please contact pengziqiao@ruc.edu.cn
For commercial licensing, please contact fanzhaoxin@psyai.net
## **License**
This project is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. Please read the [LICENSE](LICENSE) file for more information.
## **Invitation**
We invite you to join [Psyche AI Inc](https://www.psyai.com/home) to conduct cutting-edge research and business implementation together. At Psyche AI Inc, we are committed to pushing the boundaries of what's possible in the fields of artificial intelligence and computer vision, especially their applications in avatars. As a member of our team, you will have the opportunity to collaborate with talented individuals, innovate new ideas, and contribute to projects that have a real-world impact.
If you are passionate about working at the forefront of technology and making a difference, we would love to hear from you. Please visit our website at [Psyche AI Inc](https://www.psyai.com/home) to learn more about us and to apply for open positions. You can also contact us at fanzhaoxin@psyai.net.
Let's shape the future together!!

87
emotalk_own/render.py Normal file

@@ -0,0 +1,87 @@
import bpy
import os
import numpy as np
import sys
filename = str(sys.argv[-1])
root_dir = str(sys.argv[-2])
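# This script is launched by demo.py as:
#   blender -t 64 -b render.blend -P render.py -- <result_path> <wav_name>
# Blender forwards everything after "--" via sys.argv, so the last two arguments are
# the results directory and the clip name of the .npy blendshape file to render.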
model_bsList = ["browDownLeft",
"browDownRight",
"browInnerUp",
"browOuterUpLeft",
"browOuterUpRight",
"cheekPuff",
"cheekSquintLeft",
"cheekSquintRight",
"eyeBlinkLeft",
"eyeBlinkRight",
"eyeLookDownLeft",
"eyeLookDownRight",
"eyeLookInLeft",
"eyeLookInRight",
"eyeLookOutLeft",
"eyeLookOutRight",
"eyeLookUpLeft",
"eyeLookUpRight",
"eyeSquintLeft",
"eyeSquintRight",
"eyeWideLeft",
"eyeWideRight",
"jawForward",
"jawLeft",
"jawOpen",
"jawRight",
"mouthClose",
"mouthDimpleLeft",
"mouthDimpleRight",
"mouthFrownLeft",
"mouthFrownRight",
"mouthFunnel",
"mouthLeft",
"mouthLowerDownLeft",
"mouthLowerDownRight",
"mouthPressLeft",
"mouthPressRight",
"mouthPucker",
"mouthRight",
"mouthRollLower",
"mouthRollUpper",
"mouthShrugLower",
"mouthShrugUpper",
"mouthSmileLeft",
"mouthSmileRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
"noseSneerLeft",
"noseSneerRight",
"tongueOut"]
obj = bpy.data.objects["face"]
bpy.context.scene.render.engine = 'BLENDER_WORKBENCH'
bpy.context.scene.display.shading.light = 'MATCAP'
bpy.context.scene.display.render_aa = 'FXAA'
bpy.context.scene.render.resolution_x = int(512)
bpy.context.scene.render.resolution_y = int(768)
bpy.context.scene.render.fps = 30
bpy.context.scene.render.image_settings.file_format = 'PNG'
cam = bpy.data.objects['Camera']
cam.scale = [2, 2, 2]
bpy.context.scene.camera = cam
output_dir = root_dir + filename
blendshape_path = root_dir + filename + '.npy'
result = []
bs = np.load(blendshape_path)
for i in range(bs.shape[0]):
curr_bs = bs[i]
for j in range(52):
obj.data.shape_keys.key_blocks[model_bsList[j]].value = curr_bs[j]
bpy.context.scene.render.filepath = os.path.join(output_dir, '{}.png'.format(i))
bpy.ops.render.render(write_still=True)

5
emotalk_own/requirements.txt Normal file

@@ -0,0 +1,5 @@
numpy~=1.21.6
transformers~=4.26.0
tqdm~=4.64.1
librosa~=0.10.0
scipy~=1.9.1

39
emotalk_own/utils.py Normal file

@@ -0,0 +1,39 @@
# Borrowed from https://github.com/EvelynFan/FaceFormer/blob/main/faceformer.py
import torch
import math
# Temporal Bias
def init_biased_mask(n_head, max_seq_len, period):
def get_slopes(n):
def get_slopes_power_of_2(n):
start = (2 ** (-2 ** -(math.log2(n) - 3)))
ratio = start
return [start * ratio ** i for i in range(n)]
if math.log2(n).is_integer():
return get_slopes_power_of_2(n)
else:
closest_power_of_2 = 2 ** math.floor(math.log2(n))
return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][
:n - closest_power_of_2]
slopes = torch.Tensor(get_slopes(n_head))
bias = torch.arange(start=0, end=max_seq_len, step=period).unsqueeze(1).repeat(1, period).view(-1) // (period)
bias = - torch.flip(bias, dims=[0])
alibi = torch.zeros(max_seq_len, max_seq_len)
for i in range(max_seq_len):
alibi[i, :i + 1] = bias[-(i + 1):]
alibi = slopes.unsqueeze(1).unsqueeze(1) * alibi.unsqueeze(0)
mask = (torch.triu(torch.ones(max_seq_len, max_seq_len)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
mask = mask.unsqueeze(0) + alibi
return mask
# Alignment Bias
def enc_dec_mask(device, T, S):
mask = torch.ones(T, S).to(device)
for i in range(T):
mask[i, i] = 0
return (mask == 1).to(device=device)
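# Quick shape check (illustrative): for 4 heads, a 10-frame sequence and period 1,
# init_biased_mask yields a (4, 10, 10) additive attention bias (ALiBi-style causal
# mask) and enc_dec_mask yields a (10, 10) boolean mask whose diagonal is the only
# unmasked position, i.e. target frame i may only attend to source frame i.
if __name__ == "__main__":
    print(init_biased_mask(n_head=4, max_seq_len=10, period=1).shape)  # torch.Size([4, 10, 10])
    print(enc_dec_mask(torch.device("cpu"), 10, 10).shape)             # torch.Size([10, 10])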

245
emotalk_own/wav2vec.py Executable file
View File

@@ -0,0 +1,245 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from dataclasses import dataclass
from transformers import Wav2Vec2Model, Wav2Vec2PreTrainedModel
from transformers.modeling_outputs import BaseModelOutput
from typing import Optional, Tuple
from transformers.file_utils import ModelOutput
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
_CONFIG_FOR_DOC = "Wav2Vec2Config"
_HIDDEN_STATES_START_POSITION = 2
# the implementation of Wav2Vec2Model is borrowed from https://huggingface.co/transformers/_modules/transformers/models/wav2vec2/modeling_wav2vec2.html#Wav2Vec2Model
# initialize our encoder with the pre-trained wav2vec 2.0 weights.
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.Tensor] = None,
min_masks: int = 0,
) -> np.ndarray:
bsz, all_sz = shape
mask = np.full((bsz, all_sz), False)
all_num_mask = int(
mask_prob * all_sz / float(mask_length)
+ np.random.rand()
)
all_num_mask = max(min_masks, all_num_mask)
mask_idcs = []
padding_mask = attention_mask.ne(1) if attention_mask is not None else None
for i in range(bsz):
if padding_mask is not None:
sz = all_sz - padding_mask[i].long().sum().item()
num_mask = int(
mask_prob * sz / float(mask_length)
+ np.random.rand()
)
num_mask = max(min_masks, num_mask)
else:
sz = all_sz
num_mask = all_num_mask
lengths = np.full(num_mask, mask_length)
if sum(lengths) == 0:
lengths[0] = min(mask_length, sz - 1)
min_len = min(lengths)
if sz - min_len <= num_mask:
min_len = sz - num_mask - 1
mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])])
mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
min_len = min([len(m) for m in mask_idcs])
for i, mask_idc in enumerate(mask_idcs):
if len(mask_idc) > min_len:
mask_idc = np.random.choice(mask_idc, min_len, replace=False)
mask[i, mask_idc] = True
return mask
# linear interpolation layer
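# Resamples the wav2vec 2.0 feature sequence (roughly 50 feature frames per second of
# 16 kHz audio) onto the 30 fps blendshape timeline expected by the decoder.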
def linear_interpolation(features, input_fps, output_fps, output_len=None):
features = features.transpose(1, 2)
seq_len = features.shape[2] / float(input_fps)
if output_len is None:
output_len = int(seq_len * output_fps)
output_features = F.interpolate(features, size=output_len, align_corners=True, mode='linear')
return output_features.transpose(1, 2)
class Wav2Vec2Model(Wav2Vec2Model):
def __init__(self, config):
super().__init__(config)
self.lm_head = nn.Linear(1024, 32)
def forward(
self,
input_values,
attention_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
frame_num=None
):
self.config.output_attentions = True
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
hidden_states = self.feature_extractor(input_values)
hidden_states = hidden_states.transpose(1, 2)
hidden_states = linear_interpolation(hidden_states, 50, 30, output_len=frame_num)
if attention_mask is not None:
output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
attention_mask = torch.zeros(
hidden_states.shape[:2], dtype=hidden_states.dtype, device=hidden_states.device
)
attention_mask[
(torch.arange(attention_mask.shape[0], device=hidden_states.device), output_lengths - 1)
] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
hidden_states = self.feature_projection(hidden_states)[0]
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
if not return_dict:
return (hidden_states,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@dataclass
class SpeechClassifierOutput(ModelOutput):
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
class Wav2Vec2ClassificationHead(nn.Module):
"""Head for wav2vec classification task."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.final_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.pooling_mode = config.pooling_mode
self.config = config
self.wav2vec2 = Wav2Vec2Model(config)
self.classifier = Wav2Vec2ClassificationHead(config)
self.init_weights()
def freeze_feature_extractor(self):
self.wav2vec2.feature_extractor._freeze_parameters()
def merged_strategy(
self,
hidden_states,
mode="mean"
):
if mode == "mean":
outputs = torch.mean(hidden_states, dim=1)
elif mode == "sum":
outputs = torch.sum(hidden_states, dim=1)
elif mode == "max":
outputs = torch.max(hidden_states, dim=1)[0]
else:
raise Exception(
"The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
return outputs
def forward(
self,
input_values,
attention_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
frame_num=None,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.wav2vec2(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states1 = linear_interpolation(hidden_states, 50, 30, output_len=frame_num)
hidden_states = self.merged_strategy(hidden_states1, mode=self.pooling_mode)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SpeechClassifierOutput(
loss=loss,
logits=logits,
hidden_states=hidden_states1,
attentions=outputs.attentions,
)
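A brief usage sketch for the modified encoder above, assuming an arbitrary wav2vec 2.0 "large" checkpoint (hidden_size=1024, matching the lm_head) and a hypothetical example.wav clip; frame_num resamples the roughly 50 fps wav2vec features to a fixed number of 30 fps frames via linear_interpolation.

import torch
import librosa

# Hypothetical checkpoint; any wav2vec 2.0 model with hidden_size=1024 fits the lm_head above.
encoder = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")
encoder.eval()

speech, sr = librosa.load("example.wav", sr=16000)       # hypothetical 3-second clip
inputs = torch.FloatTensor(speech).unsqueeze(0)          # (1, num_samples)
with torch.no_grad():
    out = encoder(inputs, frame_num=90)                  # 90 frames = 3 s at 30 fps
print(out.last_hidden_state.shape)                       # (1, 90, 1024)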

4
main.py Normal file
View File

@@ -0,0 +1,4 @@
from piedemo.web import Web
from piedemo.fields.ajax_group import AjaxChatField

View File

@@ -0,0 +1,164 @@
import random
import shutil
model_bsList = ["browDownLeft",
"browDownRight",
"browInnerUp",
"browOuterUpLeft",
"browOuterUpRight",
"cheekPuff",
"cheekSquintLeft",
"cheekSquintRight",
"eyeBlinkLeft",
"eyeBlinkRight",
"eyeLookDownLeft",
"eyeLookDownRight",
"eyeLookInLeft",
"eyeLookInRight",
"eyeLookOutLeft",
"eyeLookOutRight",
"eyeLookUpLeft",
"eyeLookUpRight",
"eyeSquintLeft",
"eyeSquintRight",
"eyeWideLeft",
"eyeWideRight",
"jawForward",
"jawLeft",
"jawOpen",
"jawRight",
"mouthClose",
"mouthDimpleLeft",
"mouthDimpleRight",
"mouthFrownLeft",
"mouthFrownRight",
"mouthFunnel",
"mouthLeft",
"mouthLowerDownLeft",
"mouthLowerDownRight",
"mouthPressLeft",
"mouthPressRight",
"mouthPucker",
"mouthRight",
"mouthRollLower",
"mouthRollUpper",
"mouthShrugLower",
"mouthShrugUpper",
"mouthSmileLeft",
"mouthSmileRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
"noseSneerLeft",
"noseSneerRight",
"tongueOut"]
import bpy
import os
import numpy as np
import sys
filename = str(sys.argv[-1])
root_dir = str(sys.argv[-2])
object_name = "MFA_body"
obj = bpy.data.objects[object_name]
bpy.context.scene.render.engine = 'BLENDER_WORKBENCH'
bpy.context.scene.display.shading.light = 'MATCAP'
bpy.context.scene.display.render_aa = 'FXAA'
bpy.context.scene.render.resolution_x = int(512)
bpy.context.scene.render.resolution_y = int(768)
bpy.context.scene.render.fps = 30
bpy.context.scene.render.image_settings.file_format = 'PNG'
cam = bpy.data.objects['0Camera']
cam.scale = [2, 2, 2]
bpy.context.scene.camera = cam
"""
model_bsList = ['Basis',
'0',
'X_postrig',
'X_neck',
'X_head',
'X_eyesfix',
'X_breast',
'X_nails',
'X_pus_conf.1',
'X_pus_assym', 'X_jadafication',
'X_facetweak', 'X_eyeshape',
'A_nipple_in', 'A_nailsmax',
'A_pregnant', 'PAD_breathe',
'PAD_swallow', 'Head',
'cr_neck1', 'cr_neck2',
'cr_neck3.R', 'cr_neck3.L',
'cr_neck4.L', 'cr_neck4.R', 'cr_jaw1', 'cr_jaw2', 'sqz_jaw3', 'cr_brows_dwn', 'cr_brows_up',
'cr_eye_lookdown', 'cr_eye_open',
'cr_eye_look.L', 'cr_eye_look.R', 'cr_mouthmax.L', 'cr_mouthmax.R', 'cr_cheekin.L', 'cr_cheekin.R', 'Body', 'cr_spine',
'cr_spine2', 'cr_spine3', 'cr_spine2.L',
'cr_spine2.R', 'cr_spine4.L', 'cr_spine4.R',
'cr_spine5.L', 'cr_spine5.R', 'cr_lowerspine.bcw',
'cr_lowerspine.fwd', 'size_breastXL.L', 'size_breastXL.R',
'size_breastXS.L', 'size_breastXS.R', 'size_oreola.L',
'size_oreola.R', 'Legs', 'cr_hipout.L', 'cr_hipout.R',
'cr_hipin.L', 'cr_hipin.R', 'cr_pussyflattern',
'cr_hip0.L', 'cr_hip0.R', 'cr_hip1.L', 'cr_hip1.R',
'cr_hip45.L', 'cr_hip45.R', 'sqz_hip1max.L',
'sqz_hip1max.R', 'sqz_hip1vol.L', 'sqz_hip1vol.R',
'sqz_hip1squeeze.L', 'sqz_hip1squeeze.R', 'cr_hip2.L',
'cr_hip2.R', 'sqz_hip2.L', 'sqz_hip2.R', 'cr_hip3.L',
'cr_hip3.R', 'sqz_buttrest.L', 'sqz_buttrest.R',
'cr_knee45.L', 'cr_knee45.R', 'cr_knee.L', 'cr_knee.R',
'sqz_knee.L', 'sqz_knee.R', 'sqz_stance.L', 'sqz_stance.R',
'cr_buttheart.L', 'cr_buttheart.R', 'rest_buttcheek.L',
'rest_buttcheek.R', 'rest_knee.L', 'rest_knee.R', 'rest_knee_fat.L',
'rest_knee_fat.R', 'rest_hip.L', 'rest_hip.R', 'vol_butt.L',
'vol_butt.R', 'Feet', 'cr_feet1.L', 'cr_feet1.R', 'cr_feet2.L',
'cr_feet2.R', 'cr_feet3.L', 'cr_feet3.R', 'cr_toe1.L', 'cr_toe1.R',
'cr_toe2.L', 'cr_toe2.R', 'Arms', 'cr_arm-up.L', 'cr_arm-up.R',
'cr_arm-fwd.L', 'cr_arm-fwd.R', 'cr_arm-dwn.L', 'cr_arm-dwn.R',
'sqz_arm-fwd.L', 'sqz_arm-fwd.R', 'sqz_armpit.L', 'sqz_armpit.R',
'sqz_arm-bcw.L', 'sqz_arm-bcw.R', 'sqz_arm-bcw_max.L',
'sqz_arm-bcw_max.R', 'cr_arm-trc.L', 'cr_arm-trc.R',
'D_cr_elbow.L', 'U_cr_elbow.L', 'D_cr_elbow.R', 'U_cr_elbow.R',
'D_sqz_elbowMax.L', 'U_sqz_elbowMax.L', 'D_sqz_elbowMax.R',
'U_sqz_elbowMax.R', 'cr_armrest.L', 'cr_armrest.R',
'cr_shoulder_fwd.L', 'cr_shoulder_fwd.R', 'cr_shoulder_bcw.L',
'cr_shoulder_bcw.R', 'cr_shoulder_dwn.L', 'cr_shoulder_dwn.R',
'cr_shoulder_up.L', 'cr_shoulder_up.R', 'rest_elbow.L', 'rest_elbow.R',
'Hands', 'cr_hand1.L', 'cr_hand1.R',
'cr_hand2.L', 'cr_hand2.R', 'cr_handtwistU.L', 'cr_handtwistU.R',
'cr_handtwistD.L',
'cr_handtwistD.R',
'cr_thumb.01.L', 'cr_thumb.01.R',
'cr_f_index.01.L', 'cr_f_index.01.R', 'cr_f_index.02.L',
'cr_f_index.02.R',
'cr_f_middle.01.L', 'cr_f_middle.01.R', 'cr_f_middle.02.L',
'cr_f_middle.02.R', 'cr_f_ring.01.L', 'cr_f_ring.01.R',
'cr_f_ring.02.L', 'cr_f_ring.02.R', 'cr_f_pinky.01.L',
'cr_f_pinky.01.R', 'cr_f_pinky.02.L', 'cr_f_pinky.02.R', 'EM',
'em_eye_close.L', 'em_eye_close.R', 'em_eye_half.L', 'em_eye_half.R',
'em_smile_open', 'em_smile_close', 'em_kiss', 'em_disg', 'em_blow',
'em_surprise', 'em_sad', 'em_frown', 'PH', 'ph_+', 'ph_bpm',
'ph_fv', 'ph_ou',
'ph_e', 'ph_r', 'ph_ch', 'ph_th', 'ph_a']"""
model_bsList = list(obj.data.shape_keys.key_blocks.keys())
# print(obj.data.shape_keys.key_blocks.keys())
output_dir = root_dir + filename
blendshape_path = root_dir + filename + '.npy'
result = []
bs = np.load(blendshape_path)
for i in range(10):
for kp_name in model_bsList:
obj.data.shape_keys.key_blocks[kp_name].value = random.random()
bpy.context.scene.render.filepath = os.path.join(output_dir,
'{}.png'.format(i))
bpy.ops.render.render(write_still=True)

0
miapia_own/__init__.py Normal file
View File

57
miapia_own/a.py Normal file
View File

@@ -0,0 +1,57 @@
import bpy
import os
import numpy as np
import sys
filename = str(sys.argv[-1])
root_dir = str(sys.argv[-2])
object_name = "MFA_body"
obj = bpy.data.objects[object_name]
bpy.context.scene.render.engine = 'BLENDER_WORKBENCH'
bpy.context.scene.display.shading.light = 'MATCAP'
bpy.context.scene.display.render_aa = 'FXAA'
bpy.context.scene.render.resolution_x = int(512)
bpy.context.scene.render.resolution_y = int(768)
bpy.context.scene.render.fps = 30
bpy.context.scene.render.image_settings.file_format = 'PNG'
cam = bpy.data.objects['0Camera']
cam.scale = [2, 2, 2]
bpy.context.scene.camera = cam
model_bsList = ['Basis',
'0',
'X_postrig',
'X_neck',
'X_head',
'X_eyesfix',
'X_breast',
'X_nails',
'X_pus_conf.1',
'X_pus_assym', 'X_jadafication',
'X_facetweak', 'X_eyeshape',
'A_nipple_in', 'A_nailsmax',
'A_pregnant', 'PAD_breathe',
'PAD_swallow', 'Head',
'cr_neck1', 'cr_neck2',
'cr_neck3.R', 'cr_neck3.L',
'cr_neck4.L', 'cr_neck4.R', 'cr_jaw1', 'cr_jaw2', 'sqz_jaw3', 'cr_brows_dwn', 'cr_brows_up',
'cr_eye_lookdown', 'cr_eye_open',
'cr_eye_look.L', 'cr_eye_look.R', 'cr_mouthmax.L', 'cr_mouthmax.R', 'cr_cheekin.L', 'cr_cheekin.R', 'Body', 'cr_spine', 'cr_spine2', 'cr_spine3', 'cr_spine2.L', 'cr_spine2.R', 'cr_spine4.L', 'cr_spine4.R', 'cr_spine5.L', 'cr_spine5.R', 'cr_lowerspine.bcw', 'cr_lowerspine.fwd', 'size_breastXL.L', 'size_breastXL.R', 'size_breastXS.L', 'size_breastXS.R', 'size_oreola.L', 'size_oreola.R', 'Legs', 'cr_hipout.L', 'cr_hipout.R', 'cr_hipin.L', 'cr_hipin.R', 'cr_pussyflattern', 'cr_hip0.L', 'cr_hip0.R', 'cr_hip1.L', 'cr_hip1.R', 'cr_hip45.L', 'cr_hip45.R', 'sqz_hip1max.L', 'sqz_hip1max.R', 'sqz_hip1vol.L', 'sqz_hip1vol.R', 'sqz_hip1squeeze.L', 'sqz_hip1squeeze.R', 'cr_hip2.L', 'cr_hip2.R', 'sqz_hip2.L', 'sqz_hip2.R', 'cr_hip3.L', 'cr_hip3.R', 'sqz_buttrest.L', 'sqz_buttrest.R', 'cr_knee45.L', 'cr_knee45.R', 'cr_knee.L', 'cr_knee.R', 'sqz_knee.L', 'sqz_knee.R', 'sqz_stance.L', 'sqz_stance.R', 'cr_buttheart.L', 'cr_buttheart.R', 'rest_buttcheek.L', 'rest_buttcheek.R', 'rest_knee.L', 'rest_knee.R', 'rest_knee_fat.L', 'rest_knee_fat.R', 'rest_hip.L', 'rest_hip.R', 'vol_butt.L', 'vol_butt.R', 'Feet', 'cr_feet1.L', 'cr_feet1.R', 'cr_feet2.L', 'cr_feet2.R', 'cr_feet3.L', 'cr_feet3.R', 'cr_toe1.L', 'cr_toe1.R', 'cr_toe2.L', 'cr_toe2.R', 'Arms', 'cr_arm-up.L', 'cr_arm-up.R', 'cr_arm-fwd.L', 'cr_arm-fwd.R', 'cr_arm-dwn.L', 'cr_arm-dwn.R', 'sqz_arm-fwd.L', 'sqz_arm-fwd.R', 'sqz_armpit.L', 'sqz_armpit.R', 'sqz_arm-bcw.L', 'sqz_arm-bcw.R', 'sqz_arm-bcw_max.L', 'sqz_arm-bcw_max.R', 'cr_arm-trc.L', 'cr_arm-trc.R', 'D_cr_elbow.L', 'U_cr_elbow.L', 'D_cr_elbow.R', 'U_cr_elbow.R', 'D_sqz_elbowMax.L', 'U_sqz_elbowMax.L', 'D_sqz_elbowMax.R', 'U_sqz_elbowMax.R', 'cr_armrest.L', 'cr_armrest.R', 'cr_shoulder_fwd.L', 'cr_shoulder_fwd.R', 'cr_shoulder_bcw.L', 'cr_shoulder_bcw.R', 'cr_shoulder_dwn.L', 'cr_shoulder_dwn.R', 'cr_shoulder_up.L', 'cr_shoulder_up.R', 'rest_elbow.L', 'rest_elbow.R', 'Hands', 'cr_hand1.L', 'cr_hand1.R', 'cr_hand2.L', 'cr_hand2.R', 'cr_handtwistU.L', 'cr_handtwistU.R', 'cr_handtwistD.L', 'cr_handtwistD.R', 'cr_thumb.01.L', 'cr_thumb.01.R', 'cr_f_index.01.L', 'cr_f_index.01.R', 'cr_f_index.02.L', 'cr_f_index.02.R', 'cr_f_middle.01.L', 'cr_f_middle.01.R', 'cr_f_middle.02.L', 'cr_f_middle.02.R', 'cr_f_ring.01.L', 'cr_f_ring.01.R', 'cr_f_ring.02.L', 'cr_f_ring.02.R', 'cr_f_pinky.01.L', 'cr_f_pinky.01.R', 'cr_f_pinky.02.L', 'cr_f_pinky.02.R', 'EM', 'em_eye_close.L', 'em_eye_close.R', 'em_eye_half.L', 'em_eye_half.R', 'em_smile_open', 'em_smile_close', 'em_kiss', 'em_disg', 'em_blow', 'em_surprise', 'em_sad', 'em_frown', 'PH', 'ph_+', 'ph_bpm', 'ph_fv', 'ph_ou', 'ph_e', 'ph_r', 'ph_ch', 'ph_th', 'ph_a']
# print(obj.data.shape_keys.key_blocks.keys())
output_dir = root_dir + filename
blendshape_path = root_dir + filename + '.npy'
result = []
bs = np.load(blendshape_path)
for i in range(bs.shape[0]):
obj.data.shape_keys.key_blocks['cr_eye_open'].value = i / bs.shape[0]
bpy.context.scene.render.filepath = os.path.join(output_dir,
'{}.png'.format(i))
bpy.ops.render.render(write_still=True)

36
miapia_own/aihandler.py Normal file
View File

@@ -0,0 +1,36 @@
import requests
class AIHandler(object):
def __init__(self):
pass
def __call__(self, text):
resp = requests.post("https://fast-pia.avemio.technology/chat-completion",
json={
"session-id": "chatcmpl",
"user-location": "Zweibrücken",
"wheel-of-life": [
{
"personal_growth": 10,
"health_exercise": 5,
"familiy_friends": 5,
"romance_relationship": 5,
"career_work": 5,
"finances": 5,
"recreation_fun": 5,
"living_situation": 5}
],
"messages": [
{
"role": "user",
"content": text
}
]
})
resp = resp.json()
return {
"text": resp[0]['text'],
"emotion": resp[0]['emotion']
}
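A quick usage sketch for the wrapper above; the endpoint and the response shape are the ones hardcoded in __call__.

if __name__ == "__main__":
    handler = AIHandler()
    reply = handler("Hello, how are you?")
    # The backend returns a list of messages; the wrapper surfaces text and emotion of the first one.
    print(reply["emotion"], "-", reply["text"])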

View File

@@ -0,0 +1,36 @@
import os
from time import time
import requests
import openai
class AIHandlerStream(object):
    def __init__(self):
        # Read the API key from the environment; never hardcode secrets in source control.
        self.ai = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def __call__(self, text):
out = ""
for chunk in self.ai.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are PIA. You talk with short sentences. And help people."},
{"role": "user", "content": text}
], stream=True
):
delta = chunk.choices[0].delta.content
if delta is None:
continue
out += delta
if len(out) > 0 and out[-1] in ['.', '!', ',', '?']:
yield out
out = ""
if len(out) > 0:
yield out
if __name__ == "__main__":
aihandler = AIHandlerStream()
t1 = time()
for text in aihandler("Hello, how are you, what is your name?"):
print(time() - t1)
print(text)

571
miapia_own/main.py Normal file
View File

@@ -0,0 +1,571 @@
import json
import sys
import re
from time import sleep, time
import logging
from collections import defaultdict
import pandas as pd
from flask import redirect
import argparse
import base64
from flask import send_file, Response, request, jsonify
from flask_socketio import emit
from piedemo.fields.ajax_group import AjaxChatField, AjaxGroup
from piedemo.fields.grid import VStack, HStack, SpaceField
from piedemo.fields.inputs.hidden import InputHiddenField
from piedemo.fields.outputs.colored_text import ptext, OutputColoredTextField
from piedemo.fields.outputs.json import OutputJSONField
from piedemo.fields.outputs.progress import ProgressField
from piedemo.fields.outputs.video import OutputVideoField
from piedemo.hub.swagger_utils.method import describe, check_missing_keys
from piedemo.web import Web
import os
import io
from piedemo.page import Page
from piedemo.hub.svgpil import SVGImage
from piedemo.fields.outputs.table import OutputTableField
from piedemo.fields.inputs.int_list import InputIntListField
from piedemo.fields.navigation import Navigation
from piedemo.fields.inputs.chat import ChatField
import librosa
import uuid
import numpy as np
import redis
import argparse
from scipy.signal import savgol_filter
import torch
import random
import os, subprocess
import shlex
import uuid
from tqdm import tqdm
from aihandler import AIHandler
from aihandler_stream import AIHandlerStream
from pieinfer import PieInfer, render_video, construct_video
import torch
from TTS.api import TTS
logging.getLogger('socketio').setLevel(logging.ERROR)
logging.getLogger('engineio').setLevel(logging.ERROR)
target_names = [
"mouthSmileLeft",
"mouthSmileRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
]
model_bsList = ["browDownLeft",
"browDownRight",
"browInnerUp",
"browOuterUpLeft",
"browOuterUpRight",
"cheekPuff",
"cheekSquintLeft",
"cheekSquintRight",
"eyeBlinkLeft",
"eyeBlinkRight",
"eyeLookDownLeft",
"eyeLookDownRight",
"eyeLookInLeft",
"eyeLookInRight",
"eyeLookOutLeft",
"eyeLookOutRight",
"eyeLookUpLeft",
"eyeLookUpRight",
"eyeSquintLeft",
"eyeSquintRight",
"eyeWideLeft",
"eyeWideRight",
"jawForward",
"jawLeft",
"jawOpen",
"jawRight",
"mouthClose",
"mouthDimpleLeft",
"mouthDimpleRight",
"mouthFrownLeft",
"mouthFrownRight",
"mouthFunnel",
"mouthLeft",
"mouthLowerDownLeft",
"mouthLowerDownRight",
"mouthPressLeft",
"mouthPressRight",
"mouthPucker",
"mouthRight",
"mouthRollLower",
"mouthRollUpper",
"mouthShrugLower",
"mouthShrugUpper",
"mouthSmileLeft",
"mouthSmileRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
"noseSneerLeft",
"noseSneerRight",
"tongueOut"]
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"
blendshapes_path = "./blendshapes"
def get_asset(fname):
return SVGImage.open(os.path.join(os.path.dirname(__file__),
"assets",
fname)).svg_content
class MainPage(Page):
def __init__(self, model_name: str):
super(MainPage, self).__init__()
self.infer = PieInfer()
self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
self.r = redis.Redis(host='localhost', port=6379, decode_responses=True)
self.aihandler = AIHandler()
self.aihandler_stream = AIHandlerStream()
self.fields = Navigation(AjaxGroup("ChatGroup", VStack([
HStack([
AjaxChatField("Chat",
self.register_ajax(f"/refresh_{model_name}",
self.message_sent),
deps_names=["sid",
"session_id",
"Chat",
"Chat__piedemo__file"],
use_socketio_support=True,
nopie=True,
style={
"height": "100%"
}),
OutputColoredTextField("video",
nopie=True,
use_socketio_support=True),
], xs=[8, 4]),
ProgressField("progress",
nopie=True,
use_socketio_support=True),
InputHiddenField("session_id", None),
]), no_return=True), no_submit=True, page_title="MIA PIA", page_style={
})
self.fields.add_link("SIMPLE",
"/simple",
active=model_name == "render")
self.fields.add_link("MIA PIA",
"/nice",
active=model_name != "render")
self.model_name = model_name
def get_content(self, **kwargs):
fields = self.fields.copy()
fields.child_loc["Chat"].set_default_options(["Hello! What is your name?", "Say one word and stop."])
"""
fields.child_loc["Chat"].set_avatars({
"self": get_asset("avatar.svg"),
"ChatGPT": get_asset("dog.svg"),
})
"""
session_id = str(uuid.uuid4())
return self.fill(fields, {
"video": f"""
""",
"session_id": session_id,
})
def message_sent(self, **data):
sid = data['sid']
self.emit(self.fields.child_loc["Chat"].clear_input(),
to=sid)
self.emit(self.fields.child_loc["video"].update(f"""
"""))
data = self.parse(self.fields, data)
session_id = data['session_id']
messages_map = self.r.hgetall(f'user-session:{session_id}')
messages = [self.fields.child_loc["Chat"].format_message("self" if i % 2 == 0 else "ChatGPT",
messages_map[f"message_{i}"])
for i in range(len(messages_map))]
print("history: ", messages)
text = data['Chat']['text']
self.emit(self.fields.child_loc["Chat"].update(messages + [
self.fields.child_loc["Chat"].format_message("self", text),
self.fields.child_loc["Chat"].format_message("ChatGPT", "Generating text..."),
]), to=sid)
output = self.aihandler(text)
output_text = output['text']
output_emotion = output['emotion']
messages_map[f"message_{len(messages)}"] = text
messages_map[f"message_{len(messages) + 1}"] = output_text
self.r.hset(f'user-session:{session_id}', mapping=messages_map)
self.emit(self.fields.child_loc["Chat"].update(messages + [
self.fields.child_loc["Chat"].format_message("self", text),
self.fields.child_loc["Chat"].format_message("ChatGPT", "Generating audio..."),
]), to=sid)
self.tts.tts_to_file(text=output_text,
speaker_wav="/home/ubuntu/repo/of_couse_here.wav",
language="en",
emotion=output_emotion,
file_path=f"./audio/{session_id}.wav")
speech_array, sampling_rate = librosa.load(f"./audio/{session_id}.wav",
sr=16000)
output = self.infer(speech_array, sampling_rate)
np.save(os.path.join("./audio", "{}.npy".format(session_id)),
output)
self.emit(self.fields.child_loc["Chat"].update(messages + [
self.fields.child_loc["Chat"].format_message("self", text),
self.fields.child_loc["Chat"].format_message("ChatGPT", "Rendering..."),
]), to=sid)
n = output.shape[0]
for i, fname in enumerate(tqdm(render_video(f"{session_id}",
model_name=self.model_name),
total=n)):
print("Got frame: ", fname, file=sys.stderr)
self.emit(self.fields.child_loc["progress"].update(100 * i // n),
to=sid)
construct_video(session_id)
self.emit(self.fields.child_loc["video"].update(f"""
<video controls="1" autoplay="1" name="media" style="border-radius: 12px; height: 80%">
<source src="/api/video/{session_id}" type="video/mp4">
</video>
"""), to=sid)
'''self.emit(self.fields.child_loc["video"].update(f"""
<img name="media" style="border-radius: 12px; height: 80%" src="/api/video/stream/{session_id}"></img>
"""))'''
self.emit(self.fields.child_loc["Chat"].update(messages + [
self.fields.child_loc["Chat"].format_message("self", text),
self.fields.child_loc["Chat"].format_message("ChatGPT", output_text),
]), to=sid)
page = MainPage("render")
web = Web({
"": "simple",
"simple": page,
"nice": page,
}, use_socketio_support=True)
host = '0.0.0.0'
port = 8011
debug = False
app = web.get_app()
@app.route("/api/video/<session_id>", methods=["GET"])
def get_video(session_id):
return send_file("./audio/{}.mp4".format(session_id))
def gen(session_id):
for image_path in render_video(f"{session_id}"):
with open(image_path, 'rb') as f:
yield (b'--frame\r\n'
b'Content-Type: image/jpeg\r\n\r\n' + f.read() + b'\r\n')
construct_video(session_id)
@app.route("/api/video/stream/<session_id>", methods=["GET"])
def get_video_async(session_id):
return Response(gen(session_id),
mimetype='multipart/x-mixed-replace; boundary=frame')
speaker_path = "/home/ubuntu/repo/female.wav"
@app.route("/api/set_speaker", methods=["POST"])
@describe(["3dmodel"],
name="Set emotion for 3D model",
description="""Set speaker for 3D model""",
inputs={
"user_id": "This ID from article Unique Identifier for iPHONE",
"speaker": "voice1 or voice2"
},
outputs={
"status": "ok"
})
@check_missing_keys([
("user_id", {"status": "error", "status_code": "missing_user_id_error"}),
("speaker", {"status": "error", "status_code": "missing_emotion_error"}),
])
def set_speaker():
speaker = request.json.get("speaker")
user_id = request.json.get("user_id")
SPEAKER[user_id] = speaker
return jsonify({
'status': 'ok'
})
@app.route("/api/set_emotion", methods=["POST"])
@describe(["3dmodel"],
name="Set emotion for 3D model",
description="""Set emotion for 3D model""",
inputs={
"user_id": "This ID from article Unique Identifier for iPHONE",
"emotion": "sad"
},
outputs={
"status": "ok"
})
@check_missing_keys([
("user_id", {"status": "error", "status_code": "missing_user_id_error"}),
("emotion", {"status": "error", "status_code": "missing_emotion_error"}),
])
def set_emotion():
emotion = request.json.get("emotion")
user_id = request.json.get("user_id")
EMOTIONS[user_id] = emotion
return jsonify({
'status': 'ok'
})
@app.route("/api/get_texts", methods=["POST"])
@describe(["text"],
name="Get texts for user_id",
description="""This endpoint get all texts for current iPhone""",
inputs={
"user_id": "This ID from article Unique Identifier for iPHONE"
},
outputs={
"text": "Output",
"id": "bot or user",
})
@check_missing_keys([
("user_id", {"status": "error", "status_code": "missing_user_id_error"}),
])
def get_texts():
user_id = request.json.get("user_id")
return jsonify(TEXTS[user_id])
@app.route("/api/send_text", methods=["POST"])
@describe(["text"],
name="Sent text to miapia",
description="""This endpoint sends texts for client""",
inputs={
"text": "Hello, MIAPIA",
"user_id": "This ID from article Unique Identifier for iPHONE"
},
outputs={
"status": "ok"
})
@check_missing_keys([
("text", {"status": "error", "status_code": "missing_text_error"}),
("user_id", {"status": "error", "status_code": "missing_user_id_error"}),
])
def send_text():
user_id = request.json.get("user_id")
text = request.json.get("text", "")
TEXTS[user_id].append({
"id": 'user',
"text": text
})
output_texts = page.aihandler_stream(text)
bot_text = ""
for output_text in output_texts:
bot_text += " " + output_text
TEXTS[user_id].append({
"id": 'bot',
"text": bot_text
})
return jsonify({
"status": "ok",
"messages": TEXTS[user_id]
})
io = web.get_socketio(app,
engineio_logger=False)
head_memories = {}
TEXTS = defaultdict(list)
EMOTIONS = {}
SPEAKER = {}
def get_event(name, value, timestamp):
return {
"index": model_bsList.index(name),
"value": value,
"timestamp": timestamp
}
def get_value(events, name):
index = model_bsList.index(name)
events = [event for event in events
if event['index'] == index]
if len(events) == 0:
return None
return events[-1]['value']
def get_head_memory():
ids = [100, 101, 103, 104, 106, 107, 109, 110]
return [[0, 0, 1] for _ in range(len(ids))]
def get_head_rotations(alpha, duration, memory, sign):
ids = [100, 101, 103, 104, 106, 107, 109, 110]
for _ in range(3):
index = ids.index(random.choice(ids))
step = 0.01 * sign[index]
memory[index][0] += step
memory[index][0] = min(memory[index][0], memory[index][2])
memory[index][0] = max(memory[index][0], memory[index][1])
print(memory)
return [{
"index": j,
"value": memory[i][0],
"timestamp": float(duration * alpha)
} for i, j in enumerate(ids)], memory
def perform_on_text(output_text, sid, head_memory, sign, voice):
session_id = str(uuid.uuid4())
page.tts.tts_to_file(text=output_text,
speaker_wav="/home/ubuntu/repo/female.wav" if voice == "voice1" else "/home/ubuntu/repo/indian.wav",
language="en",
emotion="Happy",
file_path=f"./audio/{session_id}.wav")
audio_path = f"./audio/{session_id}.wav"
with open(audio_path, 'rb') as f:
audio_content = f.read()
encode_string = base64.b64encode(audio_content).decode('utf-8')
speech_array, sampling_rate = librosa.load(audio_path,
sr=16000)
duration = librosa.get_duration(y=speech_array,
sr=sampling_rate)
output = page.infer(speech_array, sampling_rate)
emit("io_push_audio_blob", {
"dataURL": f"base64,{encode_string}"
}, to=sid)
print("Sent audio.")
emit("io_set_size", {
"size": output.shape[0],
}, to=sid)
t1 = time()
for i in tqdm(range(output.shape[0])):
rots, head_memory = get_head_rotations((i / output.shape[0]), duration, head_memory, sign)
blendshapes_i = [{
"index": j,
"value": output[i, j],
"timestamp": float(duration * (i / output.shape[0]))
} for j in range(output.shape[1])] + rots
if max([get_value(blendshapes_i, target_name)
for target_name in target_names]) > 0.5:
os.makedirs(blendshapes_path,
exist_ok=True)
save_blendshapes_i = os.path.join(blendshapes_path,
str(uuid.uuid4()) + '.json')
with open(save_blendshapes_i, 'w') as f:
json.dump(blendshapes_i, f)
emit("io_set_coef", blendshapes_i, to=sid)
# sleep(0.1 * duration / output.shape[0])
t2 = time()
sleep(max(0., duration - (t2 - t1)))
return head_memory
def perform_surgery(sid, duration=5):
with open("../5-seconds-of-silence.wav", 'rb') as f:
audio_content = f.read()
encode_string = base64.b64encode(audio_content).decode('utf-8')
fps = 20
emit("io_push_audio_blob", {
"dataURL": f"base64,{encode_string}"
}, to=sid)
print("Sent audio.")
emit("io_set_size", {
"size": (fps * duration)
}, to=sid)
t1 = time()
for i in tqdm(range(fps * duration)):
alpha = float(i / (fps * duration))
emit("io_set_coef", [
get_event("eyeWideLeft",
0.3 - 0.3 * alpha,
float(duration * alpha)),
get_event("eyeWideRight",
0.3 - 0.3 * alpha,
float(duration * alpha))
], to=sid)
t2 = time()
sleep(max(0., duration - (t2 - t1)))
@io.on("io_set_text")
def io_set_text(data):
data = json.loads(data)
data = data[0]
sid = None
print(data, file=sys.stderr)
if "text" not in data:
emit("io_error", {"message": "Text not found"},
to=sid)
return
text = data["text"]
"""if "user_id" not in data:
emit("io_error", {"message": "User not found"},
to=sid)
return"""
user_id = data.get('user_id')
print(user_id)
TEXTS[user_id].append({
"id": "user",
"text": text
})
voice = SPEAKER.get(user_id, "voice1")
if sid not in head_memories:
head_memories[sid] = get_head_memory()
head_memory = head_memories[sid]
# output_texts = [page.aihandler(text)['text']]
output_texts = page.aihandler_stream(text)
bot_text = ""
for output_text in output_texts:
sign = [2 * (random.random() > 0.5) - 1
for _ in range(8)]
head_memory = perform_on_text(output_text, sid, head_memory,
sign=sign,
voice=voice)
bot_text += " " + output_text
print("SURGERY STARTED!")
# perform_surgery(sid)
print("SURGERY ENDED!")
TEXTS[user_id].append({
"id": "bot",
"text": bot_text
})
emit("io_finish", {}, to=sid)
io.run(app,
host=host, port=port, debug=debug,
allow_unsafe_werkzeug=True)
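For reference, a hedged client-side sketch of the Socket.IO protocol implemented above (assumes the python-socketio package and that the server runs with default Socket.IO settings on localhost:8011): the client emits io_set_text with a JSON-encoded list containing one message object, then receives io_push_audio_blob, io_set_size, a stream of io_set_coef blendshape events, and finally io_finish.

import json
import socketio

sio = socketio.Client()

@sio.on("io_set_coef")
def on_coef(events):
    # Each event: {"index": blendshape index in model_bsList, "value": weight, "timestamp": seconds}.
    pass

@sio.on("io_push_audio_blob")
def on_audio(payload):
    pass                                   # payload["dataURL"] carries the base64-encoded WAV audio

@sio.on("io_finish")
def on_finish(_):
    sio.disconnect()

sio.connect("http://localhost:8011")
sio.emit("io_set_text", json.dumps([{"text": "Hello PIA", "user_id": "demo-user"}]))
sio.wait()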

154
miapia_own/pieinfer.py Normal file
View File

@@ -0,0 +1,154 @@
import librosa
import numpy as np
import argparse
from torch import cuda
from parse import parse
from scipy.signal import savgol_filter
import torch
from model import EmoTalk
import random
import os, subprocess
import shlex
from munch import Munch
@torch.no_grad()
def test(model, speech_array, sampling_rate):
args = Munch(
bs_dim=52,
feature_dim=832,
period=30,
device="cuda",
model_path="./pretrain_model/EmoTalk.pth",
max_seq_len=5000,
num_workers=0,
batch_size=1,
post_processing=True,
blender_path="./blender/blender")
eye1 = np.array([0.36537236, 0.950235724, 0.95593375, 0.916715622, 0.367256105, 0.119113259, 0.025357503])
eye2 = np.array([0.234776169, 0.909951985, 0.944758058, 0.777862132, 0.191071674, 0.235437036, 0.089163929])
eye3 = np.array([0.870040774, 0.949833691, 0.949418545, 0.695911646, 0.191071674, 0.072576277, 0.007108896])
eye4 = np.array([0.000307991, 0.556701422, 0.952656746, 0.942345619, 0.425857186, 0.148335218, 0.017659493])
# speech_array, sampling_rate = librosa.load(os.path.join(wav_path), sr=16000)
audio = torch.FloatTensor(speech_array).unsqueeze(0).to(args.device)
level = torch.tensor([1]).to(args.device)
person = torch.tensor([0]).to(args.device)
prediction = model.predict(audio, level, person)
prediction = prediction.squeeze().detach().cpu().numpy()
if args.post_processing:
output = np.zeros((prediction.shape[0], prediction.shape[1]))
for i in range(prediction.shape[1]):
output[:, i] = savgol_filter(prediction[:, i], 5, 2)
output[:, 8] = 0
output[:, 9] = 0
i = random.randint(0, 60)
while i < output.shape[0] - 7:
eye_num = random.randint(1, 4)
if eye_num == 1:
output[i:i + 7, 8] = eye1
output[i:i + 7, 9] = eye1
elif eye_num == 2:
output[i:i + 7, 8] = eye2
output[i:i + 7, 9] = eye2
elif eye_num == 3:
output[i:i + 7, 8] = eye3
output[i:i + 7, 9] = eye3
else:
output[i:i + 7, 8] = eye4
output[i:i + 7, 9] = eye4
time1 = random.randint(60, 180)
i = i + time1
return output
else:
return prediction
def render_video(wav_name, model_name):
args = Munch(
bs_dim=52,
feature_dim=832,
period=30,
device="cuda",
model_path="./pretrain_model/EmoTalk.pth",
max_seq_len=5000,
num_workers=0,
batch_size=1,
post_processing=True,
blender_path="./blender/blender")
# wav_name = args.wav_path.split('/')[-1].split('.')[0]
image_path = os.path.join("./audio", wav_name)
os.makedirs(image_path, exist_ok=True)
blender_path = args.blender_path
python_path = f"./{model_name}.py"
blend_path = f"./{model_name}.blend"
print(python_path, blend_path)
# python_path = "./render.py"
# blend_path = "./render.blend"
cmd = '{} -t 64 -b {} -P {} -- "{}" "{}" '.format(blender_path,
blend_path,
python_path,
"./audio/",
wav_name)
cmd = shlex.split(cmd)
p = subprocess.Popen(cmd,
shell=False,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
while p.poll() is None:
line = p.stdout.readline().decode('utf-8')
line = line.strip()
if line and line.startswith('Saved: '):
fname = parse("Saved: '{}'", line).fixed[0]
yield fname
else:
print(line)
if p.returncode == 0:
print('Subprogram success')
else:
print('Subprogram failed')
def construct_video(wav_name):
image_path = os.path.join("./audio", wav_name)
os.makedirs(image_path, exist_ok=True)
image_temp = image_path + "/%d.png"
output_path = os.path.join("./audio", wav_name + ".mp4")
cmd = 'ffmpeg -r 30 -i "{}" -i "{}" -pix_fmt yuv420p -s 512x768 "{}" -y'.format(image_temp,
f"./audio/{wav_name}.wav",
output_path)
subprocess.call(cmd, shell=True)
cmd = 'rm -rf "{}"'.format(image_path)
subprocess.call(cmd, shell=True)
class PieInfer(object):
def __init__(self):
args = Munch(
bs_dim=52,
feature_dim=832,
period=30,
device="cuda" if cuda.is_available() else "cpu",
model_path="./pretrain_model/EmoTalk.pth",
max_seq_len=5000,
num_workers=0,
batch_size=1,
post_processing=True,
blender_path="./blender/blender")
#"""
model = EmoTalk(args)
model.load_state_dict(torch.load(args.model_path, map_location=torch.device(args.device)), strict=False)
model = model.to(args.device)
model.eval()
#"""
# model = None
self.model = model
def __call__(self,
speech_array,
sampling_rate):
return test(self.model, speech_array, sampling_rate)
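A compact, standalone sketch of the offline pipeline defined above, mirroring what miapia_own/main.py does per chat turn; "example" is a hypothetical clip name expected at ./audio/example.wav.

import librosa
import numpy as np

infer = PieInfer()
speech, sr = librosa.load("./audio/example.wav", sr=16000)
blendshapes = infer(speech, sr)                       # (num_frames, bs_dim=52) blendshape weights
np.save("./audio/example.npy", blendshapes)           # the render scripts read <root_dir><name>.npy
for frame_png in render_video("example", model_name="render"):
    print("rendered", frame_png)
construct_video("example")                            # muxes frames + audio into ./audio/example.mp4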

View File

@@ -0,0 +1,78 @@
import sys
import pandas as pd
import argparse
import base64
from flask import send_file, Response
from flask_socketio import emit
from piedemo.fields.ajax_group import AjaxChatField, AjaxGroup
from piedemo.fields.grid import VStack, HStack, SpaceField
from piedemo.fields.inputs.hidden import InputHiddenField
from piedemo.fields.outputs.colored_text import ptext, OutputColoredTextField
from piedemo.fields.outputs.json import OutputJSONField
from piedemo.fields.outputs.progress import ProgressField
from piedemo.fields.outputs.video import OutputVideoField
from piedemo.web import Web
from piedemo.page import Page
from piedemo.hub.svgpil import SVGImage
from piedemo.fields.outputs.table import OutputTableField
from piedemo.fields.inputs.int_list import InputIntListField
from piedemo.fields.navigation import Navigation
from piedemo.fields.inputs.chat import ChatField
import librosa
import uuid
import numpy as np
import redis
import argparse
from scipy.signal import savgol_filter
import torch
import random
import os, subprocess
import shlex
from tqdm import tqdm
class MainPage(Page):
def __init__(self, model_name: str):
super(MainPage, self).__init__()
web = Web({
"": "simple",
"simple": MainPage("render"),
# "nice": MainPage("FemAdv_b350_V2_050523"),
}, use_socketio_support=True)
host = '0.0.0.0'
port = 8011
debug = False
app = web.get_app()
io = web.get_socketio(app)
@io.on("io_set_text")
def io_set_text(data):
sid = None
if "text" not in data:
emit("io_error", {"message": "Text not found"},
to=sid)
encode_string = base64.b64encode(open("../feeling_good.wav", "rb").read())
for i in range(10):
j = random.randint(0, 2)
emit("io_set_coef", [{
"index": j,
"value": i / 10,
}], to=sid)
emit("io_push_audio_blob", {
"dataURL": f"base64,{encode_string}"
}, to=sid)
emit("io_finish", {}, to=sid)
io.run(app,
host=host, port=port, debug=debug,
allow_unsafe_werkzeug=True)


1
server/connect.sh Executable file
View File

@@ -0,0 +1 @@
ssh -i ~/.ssh/id_rsa_miapia ubuntu@54.172.214.227

1
server/sync_code.sh Executable file
View File

@@ -0,0 +1 @@
rsync --rsh='ssh -i ~/.ssh/id_rsa_miapia -o IdentitiesOnly=yes' --exclude='.git' --exclude 'node_modules' -Pav ../miapia_own/ ubuntu@54.172.214.227:/home/ubuntu/repo/EmoTalk_release

1
server/sync_code_mia.sh Executable file
View File

@@ -0,0 +1 @@
rsync --rsh='ssh -i ~/.ssh/id_rsa_miapia -o IdentitiesOnly=yes' --exclude='.git' --exclude 'node_modules' -Pav ../ ubuntu@54.172.214.227:/home/ubuntu/repo/

12
t2a_api.py Normal file
View File

@@ -0,0 +1,12 @@
from transformers import pipeline
import scipy.io.wavfile
class T2A(object):
def __init__(self):
self.synthesiser = pipeline("text-to-speech", "suno/bark")
def apply(self, text):
speech = self.synthesiser(text,
forward_params={"do_sample": True})
scipy.io.wavfile.write("bark_out.wav", rate=speech["sampling_rate"], data=speech["audio"])
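A usage sketch for the Bark wrapper above; the output path is the hardcoded bark_out.wav.

if __name__ == "__main__":
    t2a = T2A()
    t2a.apply("Hello from the MIA PIA text-to-audio pipeline.")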

38
test_a2f_api.py Normal file
View File

@@ -0,0 +1,38 @@
import math
import os
import requests
from pprint import pprint
import argparse
import soundfile
from a2f_api import A2F
parser = argparse.ArgumentParser()
parser.add_argument("audio_path")
parser.add_argument("--host", type=str,
default="https://a2fdemo.piedata.ai/")
args = parser.parse_args()
a2f = A2F(args.host)
print(f"Uploading {args.audio_path}...")
server_audio_path = a2f.upload(args.audio_path)
fname = os.path.basename(server_audio_path)
print("Status: ", a2f.status())
print("EmotionNames: ", a2f.get_emotion_names())
print("Scene Objects: ", a2f.get_scene_objects())
print("Scene Players: ", a2f.get_players())
print("Preprocessing settings: ", a2f.get_pre_settings())
print("Postprocessing settings: ", a2f.get_post_settings())
print("Setting player root: ", a2f.set_player_root("/home/ubuntu/results"))
print("Player root: ", a2f.get_player_root())
print("Setting audio: ", a2f.set_audio(os.path.basename(server_audio_path)))
print("Audio Range: ", a2f.get_audio_range())
print("Running: ", a2f.run())
print("NumKeys: ", a2f.get_number_of_keys())
print("Keys: ", a2f.get_generated_keys())
# print("BlendShape solvers: ", a2f.get_blendshape_solvers())
print("Exporting: ", a2f.export_json("/home/ubuntu/results",
filename=os.path.splitext(fname)[0]))
print(f"Pulling to ./{fname}...")
a2f.pull(fname)