first commit

George Kasparyants
2024-06-14 00:47:32 +03:00
commit 7591784e34
31 changed files with 3029 additions and 0 deletions

39
.gitignore vendored Executable file

@@ -0,0 +1,39 @@
# Xcode
.DS_Store
#build file
build/
profile
*.moved-aside
DerivedData
.idea/
*.xccheckout
*.xcuserstate
Thumbs.db
*.ipa
*.zip
## User settings
xcuserdata/
### SwiftPackageManager ###
Packages
xcuserdata
#CocoaPods
Pods
*.cer
*.mobileprovision
MiaPia.xcworkspace/xcuserdata/*
MiaPia.xcodeproj/xcuserdata/*
**__pycache__**
*.pyc
*.mp3
*.wav
*.png
*.blend

216
a2f_api.py Normal file

@@ -0,0 +1,216 @@
import math
import os
import requests
from pprint import pprint
import argparse
import soundfile
class A2F(object):
ROOT_PATH = "/home/ubuntu/results"
BASE_PATH = os.path.expanduser("~/.local/share/ov/pkg/audio2face-2023.2.0/")
ASSETS_PATH = os.path.join(BASE_PATH, "exts/omni.audio2face.tool/deps/audio2face-assets")
CLAIRE_PATH = os.path.join(ASSETS_PATH, "claire/mesh/claire_fullface_model.usd")
MARK_PATH = os.path.join(ASSETS_PATH, "mark/mesh/mark_fullface_model.usd")
PLAYER_NAME = "/World/audio2face/Player"
FULLFACE_MODEL_NAME = "/World/audio2face/CoreFullface"
def __init__(self, url="http://localhost:8011/"):
self.url = url
def status(self):
resp = requests.get(f"{self.url}status")
return resp.json()
def get_emotion_names(self):
resp = requests.get(f"{self.url}A2F/A2E/GetEmotionNames")
return resp.json().get('result', [])
def get_scene_objects(self):
resp = requests.get(f"{self.url}A2F/GetInstances")
return resp.json().get('result', {})
def get_players(self):
resp = requests.get(f"{self.url}A2F/Player/GetInstances")
return resp.json().get('result', {})
def load_usd(self, usd_path):
resp = requests.post(f"{self.url}A2F/USD/Load",
json={
"file_name": usd_path,
})
return resp.json()
def load_claire(self):
print("Claire path: ", self.CLAIRE_PATH)
return self.load_usd(self.CLAIRE_PATH)
def load_mark(self):
print("Mark path: ", self.MARK_PATH)
return self.load_usd(self.MARK_PATH)
def openapi(self):
resp = requests.get(f"{self.url}openapi.json")
return resp.json()
def get_frame(self):
pass
def get_settings(self):
resp = requests.post(f"{self.url}A2F/GetSettings", json={
"a2f_instance": "",
})
return resp.json()
def get_player_root(self):
resp = requests.post(f"{self.url}A2F/Player/GetRootPath", json={
"a2f_player": self.PLAYER_NAME,
})
return resp.json()
def set_player_root(self, new_path):
resp = requests.post(f"{self.url}A2F/Player/SetRootPath", json={
"a2f_player": self.PLAYER_NAME,
"dir_path": new_path,
})
return resp.json()
def set_audio(self, audio_path):
duration = soundfile.info(audio_path).duration
print("Audio duration: ", duration)
resp = requests.post(f"{self.url}A2F/Player/SetTrack", json={
"a2f_player": self.PLAYER_NAME,
"file_name": audio_path,
"time_range": [
0,
duration
]
})
data = [resp.json()]
resp = requests.post(f"{self.url}A2F/Player/SetRange", json={
"a2f_player": self.PLAYER_NAME,
"start": 0,
"end": duration
})
data.append(resp.json())
resp = requests.post(f"{self.url}A2F/Player/GetTracks", json={
"a2f_player": self.PLAYER_NAME,
})
data.append(resp.json())
resp = requests.post(f"{self.url}A2F/Player/GetCurrentTrack", json={
"a2f_player": self.PLAYER_NAME,
})
data.append(resp.json())
return data
def get_audio_range(self):
resp = requests.post(f"{self.url}A2F/Player/GetRange", json={
"a2f_player": self.PLAYER_NAME,
})
return resp.json()
def run(self):
resp = requests.post(f"{self.url}A2F/A2E/GenerateKeys", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def get_number_of_keys(self):
resp = requests.post(f"{self.url}A2F/A2E/NumKeys", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def get_generated_keys(self):
resp = requests.post(f"{self.url}A2F/A2E/GetKeyData", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def get_a2e_settings(self):
resp = requests.post(f"{self.url}A2F/A2E/GetSettings", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def get_blendshape_solvers(self):
resp = requests.get(f"{self.url}A2F/Exporter/GetBlendShapeSolvers")
return resp.json()
def get_pre_settings(self):
resp = requests.post(f"{self.url}A2F/PRE/SetSettings", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
"prediction_delay": 0.01,
})
resp = requests.post(f"{self.url}A2F/PRE/GetSettings", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def get_post_settings(self):
resp = requests.post(f"{self.url}A2F/POST/GetSettings", json={
"a2f_instance": self.FULLFACE_MODEL_NAME,
})
return resp.json()
def export(self, export_path, filename):
resp = requests.post(f"{self.url}A2F/Exporter/ExportGeometryCache", json={
"export_directory": export_path,
"cache_type": "usd",
"xform_keys": False,
"batch": False,
"file_name": filename,
"fps": 0
})
try:
return resp.json()
except ValueError:
print(resp.content)
def export_json(self, export_path, filename):
resp = requests.post(f"{self.url}A2F/Exporter/ExportBlendshapes", json={
"export_directory": export_path,
"format": "json",
"batch": False,
"file_name": filename,
"fps": 0
})
try:
return resp.json()
except ValueError:
print(resp.content)
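# Note: upload() and pull() below delegate file transfer to helper shell scripts under
# ./server (assumed to exist in this repo) that copy files to and from the machine
# running the Audio2Face GUI; ROOT_PATH above is the remote results directory they use.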
def upload(self, audio_path):
audio_path = os.path.abspath(audio_path)
fname = os.path.basename(audio_path)
os.system(f"cd ./server && ./send_file_to_gui.sh {audio_path} ../results/{fname}")
return os.path.join("/home/ubuntu/results", fname)
def pull(self, fname):
export_fname = os.path.splitext(fname)[0] + '.usd'
os.system(f"cd ./server && ./send_file_from_gui.sh ../results/{export_fname} ../")
def apply(self, audio_path):
fname = os.path.basename(audio_path)
print("Status: ", self.status())
print("EmotionNames: ", self.get_emotion_names())
print("Scene Objects: ", self.get_scene_objects())
print("Scene Players: ", self.get_players())
print("Preprocessing settings: ", self.get_pre_settings())
print("Postprocessing settings: ", self.get_post_settings())
print("Setting player root: ", self.set_player_root("/home/ubuntu/results"))
print("Player root: ", self.get_player_root())
print("Setting audio: ", self.set_audio(os.path.basename(audio_path)))
print("Audio Range: ", self.get_audio_range())
print("Running: ", self.run())
print("NumKeys: ", self.get_number_of_keys())
print("Keys: ", self.get_generated_keys())
print("Exporting: ", self.export_json("/home/ubuntu/results",
filename=os.path.splitext(fname)[0]))
def apply_stream(self, audio_path):
pass
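# Usage sketch (assumptions: an Audio2Face headless instance serves its REST API at
# http://localhost:8011/, the ./server helper scripts are available, and the WAV file
# lives in the current directory, since apply() refers to it by basename).
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Export Audio2Face blendshapes for a WAV file")
    parser.add_argument("audio_path", help="WAV file in the current directory")
    parser.add_argument("--url", default="http://localhost:8011/", help="A2F headless REST endpoint")
    args = parser.parse_args()
    a2f = A2F(url=args.url)
    pprint(a2f.status())         # sanity-check the connection
    a2f.upload(args.audio_path)  # copy the WAV into ROOT_PATH next to the A2F instance
    a2f.apply(args.audio_path)   # generate emotion keys and export blendshape JSON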

50
audio2face.py Normal file

@@ -0,0 +1,50 @@
# Speech-to-Audio2Face module using the gRPC helpers from audio2face_streaming_utils
import io
from pydub import AudioSegment
from scipy.io.wavfile import read
import numpy as np
from audio2face_streaming_utils import push_audio_track
class Audio2FaceService:
def __init__(self, sample_rate=44100):
"""
:param sample_rate: sample rate of the audio pushed to Audio2Face
"""
self.a2f_url = 'localhost:50051'  # gRPC endpoint of the local Audio2Face instance
self.sample_rate = sample_rate
self.avatar_instance = '/World/audio2face/PlayerStreaming'  # prim path of your Audio2Face Streaming Audio Player instance
def tts_to_wav(self, tts_byte, framerate=22050) -> np.ndarray:
"""
:param tts_byte: raw 16-bit mono PCM TTS data as bytes
:param framerate: frame rate of the raw TTS data
:return: decoded wav samples
"""
seg = AudioSegment.from_raw(io.BytesIO(tts_byte), sample_width=2, frame_rate=framerate, channels=1)
wavIO = io.BytesIO()
seg.export(wavIO, format="wav")
rate, wav = read(io.BytesIO(wavIO.getvalue()))
return wav
def wav_to_numpy_float32(self, wav_byte) -> np.ndarray:
"""
:param wav_byte: int16 wav samples
:return: samples normalized to float32 in [-1, 1]
"""
return wav_byte.astype(np.float32, order='C') / 32768.0
def get_tts_numpy_audio(self, audio) -> np.ndarray:
"""
:param audio: raw TTS audio bytes (see tts_to_wav)
:return: float32 numpy array of the audio
"""
wav_byte = self.tts_to_wav(audio)
return self.wav_to_numpy_float32(wav_byte)
def make_avatar_speaks(self, audio) -> None:
"""
:param audio: tts audio
:return: None
"""
push_audio_track(self.a2f_url, self.get_tts_numpy_audio(audio), self.sample_rate, self.avatar_instance)
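# Usage sketch (assumptions: Audio2Face is running with a streaming audio player at
# /World/audio2face/PlayerStreaming, and "tts_output.raw" is a hypothetical dump of
# 16-bit mono PCM at 22050 Hz produced by a TTS engine).
if __name__ == "__main__":
    with open("tts_output.raw", "rb") as f:
        raw_pcm = f.read()
    # Pass the true sample rate of the decoded audio so playback speed is correct
    Audio2FaceService(sample_rate=22050).make_avatar_speaks(raw_pcm)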

502
audio2face_pb2.py Normal file

@@ -0,0 +1,502 @@
# -*- coding: utf-8 -*-
# Generated by the protocol buffer compiler. DO NOT EDIT!
# source: audio2face.proto
"""Generated protocol buffer code."""
from google.protobuf import descriptor as _descriptor
from google.protobuf import message as _message
from google.protobuf import reflection as _reflection
from google.protobuf import symbol_database as _symbol_database
# @@protoc_insertion_point(imports)
_sym_db = _symbol_database.Default()
DESCRIPTOR = _descriptor.FileDescriptor(
name="audio2face.proto",
package="nvidia.audio2face",
syntax="proto3",
serialized_options=None,
create_key=_descriptor._internal_create_key,
serialized_pb=b'\n\x10\x61udio2face.proto\x12\x11nvidia.audio2face"{\n\x10PushAudioRequest\x12\x15\n\rinstance_name\x18\x01 \x01(\t\x12\x12\n\nsamplerate\x18\x02 \x01(\x05\x12\x12\n\naudio_data\x18\x03 \x01(\x0c\x12(\n block_until_playback_is_finished\x18\x04 \x01(\x08"5\n\x11PushAudioResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t"\x85\x01\n\x16PushAudioStreamRequest\x12@\n\x0cstart_marker\x18\x01 \x01(\x0b\x32(.nvidia.audio2face.PushAudioRequestStartH\x00\x12\x14\n\naudio_data\x18\x02 \x01(\x0cH\x00\x42\x13\n\x11streaming_request"l\n\x15PushAudioRequestStart\x12\x15\n\rinstance_name\x18\x01 \x01(\t\x12\x12\n\nsamplerate\x18\x02 \x01(\x05\x12(\n block_until_playback_is_finished\x18\x03 \x01(\x08";\n\x17PushAudioStreamResponse\x12\x0f\n\x07success\x18\x01 \x01(\x08\x12\x0f\n\x07message\x18\x02 \x01(\t2\xd4\x01\n\nAudio2Face\x12X\n\tPushAudio\x12#.nvidia.audio2face.PushAudioRequest\x1a$.nvidia.audio2face.PushAudioResponse"\x00\x12l\n\x0fPushAudioStream\x12).nvidia.audio2face.PushAudioStreamRequest\x1a*.nvidia.audio2face.PushAudioStreamResponse"\x00(\x01\x62\x06proto3',
)
_PUSHAUDIOREQUEST = _descriptor.Descriptor(
name="PushAudioRequest",
full_name="nvidia.audio2face.PushAudioRequest",
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="instance_name",
full_name="nvidia.audio2face.PushAudioRequest.instance_name",
index=0,
number=1,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="samplerate",
full_name="nvidia.audio2face.PushAudioRequest.samplerate",
index=1,
number=2,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="audio_data",
full_name="nvidia.audio2face.PushAudioRequest.audio_data",
index=2,
number=3,
type=12,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"",
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="block_until_playback_is_finished",
full_name="nvidia.audio2face.PushAudioRequest.block_until_playback_is_finished",
index=3,
number=4,
type=8,
cpp_type=7,
label=1,
has_default_value=False,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax="proto3",
extension_ranges=[],
oneofs=[],
serialized_start=39,
serialized_end=162,
)
_PUSHAUDIORESPONSE = _descriptor.Descriptor(
name="PushAudioResponse",
full_name="nvidia.audio2face.PushAudioResponse",
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="success",
full_name="nvidia.audio2face.PushAudioResponse.success",
index=0,
number=1,
type=8,
cpp_type=7,
label=1,
has_default_value=False,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="message",
full_name="nvidia.audio2face.PushAudioResponse.message",
index=1,
number=2,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax="proto3",
extension_ranges=[],
oneofs=[],
serialized_start=164,
serialized_end=217,
)
_PUSHAUDIOSTREAMREQUEST = _descriptor.Descriptor(
name="PushAudioStreamRequest",
full_name="nvidia.audio2face.PushAudioStreamRequest",
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="start_marker",
full_name="nvidia.audio2face.PushAudioStreamRequest.start_marker",
index=0,
number=1,
type=11,
cpp_type=10,
label=1,
has_default_value=False,
default_value=None,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="audio_data",
full_name="nvidia.audio2face.PushAudioStreamRequest.audio_data",
index=1,
number=2,
type=12,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"",
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax="proto3",
extension_ranges=[],
oneofs=[
_descriptor.OneofDescriptor(
name="streaming_request",
full_name="nvidia.audio2face.PushAudioStreamRequest.streaming_request",
index=0,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[],
)
],
serialized_start=220,
serialized_end=353,
)
_PUSHAUDIOREQUESTSTART = _descriptor.Descriptor(
name="PushAudioRequestStart",
full_name="nvidia.audio2face.PushAudioRequestStart",
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="instance_name",
full_name="nvidia.audio2face.PushAudioRequestStart.instance_name",
index=0,
number=1,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="samplerate",
full_name="nvidia.audio2face.PushAudioRequestStart.samplerate",
index=1,
number=2,
type=5,
cpp_type=1,
label=1,
has_default_value=False,
default_value=0,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="block_until_playback_is_finished",
full_name="nvidia.audio2face.PushAudioRequestStart.block_until_playback_is_finished",
index=2,
number=3,
type=8,
cpp_type=7,
label=1,
has_default_value=False,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax="proto3",
extension_ranges=[],
oneofs=[],
serialized_start=355,
serialized_end=463,
)
_PUSHAUDIOSTREAMRESPONSE = _descriptor.Descriptor(
name="PushAudioStreamResponse",
full_name="nvidia.audio2face.PushAudioStreamResponse",
filename=None,
file=DESCRIPTOR,
containing_type=None,
create_key=_descriptor._internal_create_key,
fields=[
_descriptor.FieldDescriptor(
name="success",
full_name="nvidia.audio2face.PushAudioStreamResponse.success",
index=0,
number=1,
type=8,
cpp_type=7,
label=1,
has_default_value=False,
default_value=False,
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
_descriptor.FieldDescriptor(
name="message",
full_name="nvidia.audio2face.PushAudioStreamResponse.message",
index=1,
number=2,
type=9,
cpp_type=9,
label=1,
has_default_value=False,
default_value=b"".decode("utf-8"),
message_type=None,
enum_type=None,
containing_type=None,
is_extension=False,
extension_scope=None,
serialized_options=None,
file=DESCRIPTOR,
create_key=_descriptor._internal_create_key,
),
],
extensions=[],
nested_types=[],
enum_types=[],
serialized_options=None,
is_extendable=False,
syntax="proto3",
extension_ranges=[],
oneofs=[],
serialized_start=465,
serialized_end=524,
)
_PUSHAUDIOSTREAMREQUEST.fields_by_name["start_marker"].message_type = _PUSHAUDIOREQUESTSTART
_PUSHAUDIOSTREAMREQUEST.oneofs_by_name["streaming_request"].fields.append(
_PUSHAUDIOSTREAMREQUEST.fields_by_name["start_marker"]
)
_PUSHAUDIOSTREAMREQUEST.fields_by_name["start_marker"].containing_oneof = _PUSHAUDIOSTREAMREQUEST.oneofs_by_name[
"streaming_request"
]
_PUSHAUDIOSTREAMREQUEST.oneofs_by_name["streaming_request"].fields.append(
_PUSHAUDIOSTREAMREQUEST.fields_by_name["audio_data"]
)
_PUSHAUDIOSTREAMREQUEST.fields_by_name["audio_data"].containing_oneof = _PUSHAUDIOSTREAMREQUEST.oneofs_by_name[
"streaming_request"
]
DESCRIPTOR.message_types_by_name["PushAudioRequest"] = _PUSHAUDIOREQUEST
DESCRIPTOR.message_types_by_name["PushAudioResponse"] = _PUSHAUDIORESPONSE
DESCRIPTOR.message_types_by_name["PushAudioStreamRequest"] = _PUSHAUDIOSTREAMREQUEST
DESCRIPTOR.message_types_by_name["PushAudioRequestStart"] = _PUSHAUDIOREQUESTSTART
DESCRIPTOR.message_types_by_name["PushAudioStreamResponse"] = _PUSHAUDIOSTREAMRESPONSE
_sym_db.RegisterFileDescriptor(DESCRIPTOR)
PushAudioRequest = _reflection.GeneratedProtocolMessageType(
"PushAudioRequest",
(_message.Message,),
{
"DESCRIPTOR": _PUSHAUDIOREQUEST,
"__module__": "audio2face_pb2"
# @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioRequest)
},
)
_sym_db.RegisterMessage(PushAudioRequest)
PushAudioResponse = _reflection.GeneratedProtocolMessageType(
"PushAudioResponse",
(_message.Message,),
{
"DESCRIPTOR": _PUSHAUDIORESPONSE,
"__module__": "audio2face_pb2"
# @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioResponse)
},
)
_sym_db.RegisterMessage(PushAudioResponse)
PushAudioStreamRequest = _reflection.GeneratedProtocolMessageType(
"PushAudioStreamRequest",
(_message.Message,),
{
"DESCRIPTOR": _PUSHAUDIOSTREAMREQUEST,
"__module__": "audio2face_pb2"
# @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioStreamRequest)
},
)
_sym_db.RegisterMessage(PushAudioStreamRequest)
PushAudioRequestStart = _reflection.GeneratedProtocolMessageType(
"PushAudioRequestStart",
(_message.Message,),
{
"DESCRIPTOR": _PUSHAUDIOREQUESTSTART,
"__module__": "audio2face_pb2"
# @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioRequestStart)
},
)
_sym_db.RegisterMessage(PushAudioRequestStart)
PushAudioStreamResponse = _reflection.GeneratedProtocolMessageType(
"PushAudioStreamResponse",
(_message.Message,),
{
"DESCRIPTOR": _PUSHAUDIOSTREAMRESPONSE,
"__module__": "audio2face_pb2"
# @@protoc_insertion_point(class_scope:nvidia.audio2face.PushAudioStreamResponse)
},
)
_sym_db.RegisterMessage(PushAudioStreamResponse)
_AUDIO2FACE = _descriptor.ServiceDescriptor(
name="Audio2Face",
full_name="nvidia.audio2face.Audio2Face",
file=DESCRIPTOR,
index=0,
serialized_options=None,
create_key=_descriptor._internal_create_key,
serialized_start=527,
serialized_end=739,
methods=[
_descriptor.MethodDescriptor(
name="PushAudio",
full_name="nvidia.audio2face.Audio2Face.PushAudio",
index=0,
containing_service=None,
input_type=_PUSHAUDIOREQUEST,
output_type=_PUSHAUDIORESPONSE,
serialized_options=None,
create_key=_descriptor._internal_create_key,
),
_descriptor.MethodDescriptor(
name="PushAudioStream",
full_name="nvidia.audio2face.Audio2Face.PushAudioStream",
index=1,
containing_service=None,
input_type=_PUSHAUDIOSTREAMREQUEST,
output_type=_PUSHAUDIOSTREAMRESPONSE,
serialized_options=None,
create_key=_descriptor._internal_create_key,
),
],
)
_sym_db.RegisterServiceDescriptor(_AUDIO2FACE)
DESCRIPTOR.services_by_name["Audio2Face"] = _AUDIO2FACE
# @@protoc_insertion_point(module_scope)

122
audio2face_pb2_grpc.py Normal file

@@ -0,0 +1,122 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc
import audio2face_pb2 as audio2face__pb2
class Audio2FaceStub(object):
"""Missing associated documentation comment in .proto file."""
def __init__(self, channel):
"""Constructor.
Args:
channel: A grpc.Channel.
"""
self.PushAudio = channel.unary_unary(
"/nvidia.audio2face.Audio2Face/PushAudio",
request_serializer=audio2face__pb2.PushAudioRequest.SerializeToString,
response_deserializer=audio2face__pb2.PushAudioResponse.FromString,
)
self.PushAudioStream = channel.stream_unary(
"/nvidia.audio2face.Audio2Face/PushAudioStream",
request_serializer=audio2face__pb2.PushAudioStreamRequest.SerializeToString,
response_deserializer=audio2face__pb2.PushAudioStreamResponse.FromString,
)
class Audio2FaceServicer(object):
"""Missing associated documentation comment in .proto file."""
def PushAudio(self, request, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details("Method not implemented!")
raise NotImplementedError("Method not implemented!")
def PushAudioStream(self, request_iterator, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details("Method not implemented!")
raise NotImplementedError("Method not implemented!")
def add_Audio2FaceServicer_to_server(servicer, server):
rpc_method_handlers = {
"PushAudio": grpc.unary_unary_rpc_method_handler(
servicer.PushAudio,
request_deserializer=audio2face__pb2.PushAudioRequest.FromString,
response_serializer=audio2face__pb2.PushAudioResponse.SerializeToString,
),
"PushAudioStream": grpc.stream_unary_rpc_method_handler(
servicer.PushAudioStream,
request_deserializer=audio2face__pb2.PushAudioStreamRequest.FromString,
response_serializer=audio2face__pb2.PushAudioStreamResponse.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler("nvidia.audio2face.Audio2Face", rpc_method_handlers)
server.add_generic_rpc_handlers((generic_handler,))
# This class is part of an EXPERIMENTAL API.
class Audio2Face(object):
"""Missing associated documentation comment in .proto file."""
@staticmethod
def PushAudio(
request,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None,
):
return grpc.experimental.unary_unary(
request,
target,
"/nvidia.audio2face.Audio2Face/PushAudio",
audio2face__pb2.PushAudioRequest.SerializeToString,
audio2face__pb2.PushAudioResponse.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
)
@staticmethod
def PushAudioStream(
request_iterator,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None,
):
return grpc.experimental.stream_unary(
request_iterator,
target,
"/nvidia.audio2face.Audio2Face/PushAudioStream",
audio2face__pb2.PushAudioStreamRequest.SerializeToString,
audio2face__pb2.PushAudioStreamResponse.FromString,
options,
channel_credentials,
insecure,
call_credentials,
compression,
wait_for_ready,
timeout,
metadata,
)

142
audio2face_streaming_utils.py Normal file

@@ -0,0 +1,142 @@
"""
This demo script shows how to send audio data to Audio2Face Streaming Audio Player via gRPC requests.
There are two options:
* Send the whole track at once using PushAudioRequest()
* Send the audio chunks sequentially in a stream using PushAudioStreamRequest()
For the second option this script emulates the stream of chunks, generated by splitting an input WAV audio file.
But in a real application such a stream of chunks may be acquired from some other streaming source:
* streaming audio via internet, streaming Text-To-Speech, etc
gRPC protocol details can be found in audio2face.proto
"""
import sys
import grpc
import time
import numpy as np
import soundfile
import audio2face_pb2
import audio2face_pb2_grpc
def push_audio_track(url, audio_data, samplerate, instance_name):
"""
This function pushes the whole audio track at once via PushAudioRequest()
PushAudioRequest parameters:
* audio_data: bytes, containing audio data for the whole track, where each sample is encoded as 4 bytes (float32)
* samplerate: sampling rate for the audio data
* instance_name: prim path of the Audio2Face Streaming Audio Player on the stage, where to push the audio data
* block_until_playback_is_finished: if True, the gRPC request will be blocked until the playback of the pushed track is finished
The request is passed to PushAudio()
"""
block_until_playback_is_finished = True # ADJUST
with grpc.insecure_channel(url) as channel:
stub = audio2face_pb2_grpc.Audio2FaceStub(channel)
request = audio2face_pb2.PushAudioRequest()
request.audio_data = audio_data.astype(np.float32).tobytes()
request.samplerate = samplerate
request.instance_name = instance_name
request.block_until_playback_is_finished = block_until_playback_is_finished
print("Sending audio data...")
response = stub.PushAudio(request)
if response.success:
print("SUCCESS")
else:
print(f"ERROR: {response.message}")
print("Closed channel")
def push_audio_track_stream(url, audio_data, samplerate, instance_name):
"""
This function pushes audio chunks sequentially via PushAudioStreamRequest()
The function emulates the stream of chunks, generated by splitting the input audio track.
But in a real application such a stream of chunks may be acquired from some other streaming source.
The first message must contain the start_marker field, carrying only meta information (without audio data):
* samplerate: sampling rate for the audio data
* instance_name: prim path of the Audio2Face Streaming Audio Player on the stage, where to push the audio data
* block_until_playback_is_finished: if True, the gRPC request will be blocked until the playback of the pushed track is finished (after the last message)
The second and subsequent messages must contain the audio_data field:
* audio_data: bytes, containing audio data for an audio chunk, where each sample is encoded as 4 bytes (float32)
All messages are packed into a Python generator and passed to PushAudioStream()
"""
chunk_size = samplerate // 10 # ADJUST
sleep_between_chunks = 0.04 # ADJUST
block_until_playback_is_finished = True # ADJUST
with grpc.insecure_channel(url) as channel:
print("Channel creadted")
stub = audio2face_pb2_grpc.Audio2FaceStub(channel)
def make_generator():
start_marker = audio2face_pb2.PushAudioRequestStart(
samplerate=samplerate,
instance_name=instance_name,
block_until_playback_is_finished=block_until_playback_is_finished,
)
# At first, we send a message with start_marker
yield audio2face_pb2.PushAudioStreamRequest(start_marker=start_marker)
# Then we send messages with audio_data
for i in range(len(audio_data) // chunk_size + 1):
time.sleep(sleep_between_chunks)
chunk = audio_data[i * chunk_size : i * chunk_size + chunk_size]
yield audio2face_pb2.PushAudioStreamRequest(audio_data=chunk.astype(np.float32).tobytes())
request_generator = make_generator()
print("Sending audio data...")
response = stub.PushAudioStream(request_generator)
if response.success:
print("SUCCESS")
else:
print(f"ERROR: {response.message}")
print("Channel closed")
def main():
"""
This demo script shows how to send audio data to Audio2Face Streaming Audio Player via gRPC requests.
There are two options:
* Send the whole track at once using PushAudioRequest()
* Send the audio chunks sequentially in a stream using PushAudioStreamRequest()
For the second option this script emulates the stream of chunks, generated by splitting an input WAV audio file.
But in a real application such a stream of chunks may be acquired from some other streaming source:
* streaming audio via internet, streaming Text-To-Speech, etc
gRPC protocol details can be found in audio2face.proto
"""
if len(sys.argv) < 3:
print("Format: python test_client.py PATH_TO_WAV INSTANCE_NAME")
return
# Sleep time emulates long latency of the request
sleep_time = 2.0 # ADJUST
# URL of the Audio2Face Streaming Audio Player server (where A2F App is running)
url = "localhost:50051" # ADJUST
# Local input WAV file path
audio_fpath = sys.argv[1]
# Prim path of the Audio2Face Streaming Audio Player on the stage (where to push the audio data)
instance_name = sys.argv[2]
data, samplerate = soundfile.read(audio_fpath, dtype="float32")
# Only Mono audio is supported
if len(data.shape) > 1:
data = np.average(data, axis=1)
print(f"Sleeping for {sleep_time} seconds")
time.sleep(sleep_time)
if 0: # ADJUST
# Push the whole audio track at once
push_audio_track(url, data, samplerate, instance_name)
else:
# Emulate audio stream and push audio chunks sequentially
push_audio_track_stream(url, data, samplerate, instance_name)
if __name__ == "__main__":
main()

54
emotalk_own/Dockerfile Normal file

@@ -0,0 +1,54 @@
FROM nvidia/cudagl:11.3.1-devel-ubuntu20.04
MAINTAINER "Jungwoo Choi"
ARG DEBIAN_FRONTEND=noninteractive
ENV TZ=Asia/Seoul
ADD requirements.txt /tmp/requirements.txt
RUN \
# Fix CUDA apt error
rm -f /etc/apt/sources.list.d/cuda.list && \
rm -f /etc/apt/sources.list.d/nvidia-ml.list && \
apt-get update && apt-get install -y gnupg2 software-properties-common && \
apt-key del 7fa2af80 && \
apt-get update && apt-get install -y --no-install-recommends wget && \
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/cuda-keyring_1.0-1_all.deb && \
dpkg -i cuda-keyring_1.0-1_all.deb && \
# Install Start
apt update && \
add-apt-repository -y ppa:savoury1/ffmpeg4 && \
apt -y install python3.8 python3.8-distutils libgl1-mesa-glx libglib2.0-0 git wget zsh vim openssh-server curl ffmpeg && \
# Python Library
update-alternatives --install /usr/bin/python python /usr/bin/python3.8 1 && \
wget https://bootstrap.pypa.io/get-pip.py && \
python3 get-pip.py && \
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113 && \
pip install -r /tmp/requirements.txt && \
# zsh option
chsh -s /bin/zsh && \
sh -c "$(wget -O- https://raw.githubusercontent.com/ohmyzsh/ohmyzsh/master/tools/install.sh)" && \
# add zsh-autosuggestions, zsh-syntax-highlighting plugin
git clone https://github.com/zsh-users/zsh-autosuggestions ~/.oh-my-zsh/custom/plugins/zsh-autosuggestions && \
git clone https://github.com/zsh-users/zsh-syntax-highlighting.git ~/.oh-my-zsh/custom/plugins/zsh-syntax-highlighting && \
# Modify .zshrc with Perl
perl -pi -w -e 's/ZSH_THEME=.*/ZSH_THEME="af-magic"/g;' ~/.zshrc && \
perl -pi -w -e 's/plugins=.*/plugins=(git ssh-agent zsh-autosuggestions zsh-syntax-highlighting)/g;' ~/.zshrc && \
# Set the ssh login and password; the default is id = root, password = root.
# I recommend changing this for better security
# PermitRootLogin : yes - for ssh connection
echo 'root:root' |chpasswd && \
sed -ri 's/^#?PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && \
sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config && \
mkdir /root/.ssh && \
mkdir /var/run/sshd && \
# install the English language pack and update the locale.
apt-get install -y language-pack-en && update-locale && \
# Clean up
apt-get clean && \
apt-get autoclean && \
apt-get autoremove -y && \
rm -rf /var/lib/cache/* && \
rm -rf /var/lib/log/*
WORKDIR /workspace
CMD ["echo", "nvidia/cudagl:11.3.1-devel-ubuntu20.04 is ready!", 'zsh']

13
emotalk_own/LICENSE Normal file

@@ -0,0 +1,13 @@
Copyright (c) 2023 Psyche AI Inc.
This work is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License (CC BY-NC 4.0). To view a copy of this license, visit http://creativecommons.org/licenses/by-nc/4.0/ or send a letter to Creative Commons, PO Box 1866, Mountain View, CA 94042, USA.
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, and distribute the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
1. Attribution — You must give appropriate credit, provide a link to the license, and indicate if changes were made. You may do so in any reasonable manner, but not in any way that suggests the licensor endorses you or your use.
2. NonCommercial — You may not use the material for commercial purposes.
3. No additional restrictions — You may not apply legal terms or technological measures that legally restrict others from doing anything the license permits.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

4
emotalk_own/blender.sh Executable file

@@ -0,0 +1,4 @@
#!/bin/bash
set -e
wget https://ftp.nluug.nl/pub/graphics/blender/release/Blender3.4/blender-3.4.1-linux-x64.tar.xz
tar -xf blender-3.4.1-linux-x64.tar.xz
mv blender-3.4.1-linux-x64 blender && rm blender-3.4.1-linux-x64.tar.xz

111
emotalk_own/demo.py Normal file
View File

@@ -0,0 +1,111 @@
import librosa
import numpy as np
import argparse
from scipy.signal import savgol_filter
import torch
from model import EmoTalk
import random
import os, subprocess
import shlex
@torch.no_grad()
def test(args):
result_path = args.result_path
os.makedirs(result_path, exist_ok=True)
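# Four hand-tuned 7-frame blink curves; during post-processing one of them is written
# into blendshape channels 8 and 9 (eyeBlinkLeft / eyeBlinkRight) at random intervals.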
eye1 = np.array([0.36537236, 0.950235724, 0.95593375, 0.916715622, 0.367256105, 0.119113259, 0.025357503])
eye2 = np.array([0.234776169, 0.909951985, 0.944758058, 0.777862132, 0.191071674, 0.235437036, 0.089163929])
eye3 = np.array([0.870040774, 0.949833691, 0.949418545, 0.695911646, 0.191071674, 0.072576277, 0.007108896])
eye4 = np.array([0.000307991, 0.556701422, 0.952656746, 0.942345619, 0.425857186, 0.148335218, 0.017659493])
model = EmoTalk(args)
model.load_state_dict(torch.load(args.model_path, map_location=torch.device(args.device)), strict=False)
model = model.to(args.device)
model.eval()
wav_path = args.wav_path
file_name = wav_path.split('/')[-1].split('.')[0]
speech_array, sampling_rate = librosa.load(os.path.join(wav_path), sr=16000)
audio = torch.FloatTensor(speech_array).unsqueeze(0).to(args.device)
level = torch.tensor([1]).to(args.device)
person = torch.tensor([0]).to(args.device)
prediction = model.predict(audio, level, person)
prediction = prediction.squeeze().detach().cpu().numpy()
if args.post_processing:
output = np.zeros((prediction.shape[0], prediction.shape[1]))
for i in range(prediction.shape[1]):
output[:, i] = savgol_filter(prediction[:, i], 5, 2)
output[:, 8] = 0
output[:, 9] = 0
i = random.randint(0, 60)
while i < output.shape[0] - 7:
eye_num = random.randint(1, 4)
if eye_num == 1:
output[i:i + 7, 8] = eye1
output[i:i + 7, 9] = eye1
elif eye_num == 2:
output[i:i + 7, 8] = eye2
output[i:i + 7, 9] = eye2
elif eye_num == 3:
output[i:i + 7, 8] = eye3
output[i:i + 7, 9] = eye3
else:
output[i:i + 7, 8] = eye4
output[i:i + 7, 9] = eye4
time1 = random.randint(60, 180)
i = i + time1
np.save(os.path.join(result_path, "{}.npy".format(file_name)), output) # with postprocessing (smoothing and blinking)
else:
np.save(os.path.join(result_path, "{}.npy".format(file_name)), prediction) # without post-processing
def render_video(args):
wav_name = args.wav_path.split('/')[-1].split('.')[0]
image_path = os.path.join(args.result_path, wav_name)
os.makedirs(image_path, exist_ok=True)
image_temp = image_path + "/%d.png"
output_path = os.path.join(args.result_path, wav_name + ".mp4")
blender_path = args.blender_path
python_path = "./render.py"
blend_path = "./render.blend"
cmd = '{} -t 64 -b {} -P {} -- "{}" "{}" '.format(blender_path, blend_path, python_path, args.result_path, wav_name)
cmd = shlex.split(cmd)
p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
while p.poll() is None:
line = p.stdout.readline()
line = line.strip()
if line:
print('[{}]'.format(line))
if p.returncode == 0:
print('Subprogram success')
else:
print('Subprogram failed')
cmd = 'ffmpeg -r 30 -i "{}" -i "{}" -pix_fmt yuv420p -s 512x768 "{}" -y'.format(image_temp, args.wav_path, output_path)
subprocess.call(cmd, shell=True)
cmd = 'rm -rf "{}"'.format(image_path)
subprocess.call(cmd, shell=True)
def main():
parser = argparse.ArgumentParser(
description='EmoTalk: Speech-driven Emotional Disentanglement for 3D Face Animation')
parser.add_argument("--wav_path", type=str, default="./audio/angry1.wav", help='path of the test data')
parser.add_argument("--bs_dim", type=int, default=52, help='number of blendshapes:52')
parser.add_argument("--feature_dim", type=int, default=832, help='number of feature dim')
parser.add_argument("--period", type=int, default=30, help='number of period')
parser.add_argument("--device", type=str, default="cuda", help='device')
parser.add_argument("--model_path", type=str, default="./pretrain_model/EmoTalk.pth",
help='path of the trained models')
parser.add_argument("--result_path", type=str, default="./result/", help='path of the result')
parser.add_argument("--max_seq_len", type=int, default=5000, help='max sequence length')
parser.add_argument("--num_workers", type=int, default=0)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--post_processing", type=bool, default=True, help='whether to use post processing')
parser.add_argument("--blender_path", type=str, default="./blender/blender", help='path of blender')
args = parser.parse_args()
test(args)
render_video(args)
if __name__ == "__main__":
main()

144
emotalk_own/model.py Normal file

@@ -0,0 +1,144 @@
import torch
import torch.nn as nn
import numpy as np
import math
from transformers import Wav2Vec2Processor, Wav2Vec2FeatureExtractor
from wav2vec import Wav2Vec2Model, Wav2Vec2ForSpeechClassification
from utils import init_biased_mask, enc_dec_mask
class EmoTalk(nn.Module):
def __init__(self, args):
super(EmoTalk, self).__init__()
self.feature_dim = args.feature_dim
self.bs_dim = args.bs_dim
self.device = args.device
self.batch_size = args.batch_size
self.audio_encoder_cont = Wav2Vec2Model.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
self.processor = Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english")
self.audio_encoder_cont.feature_extractor._freeze_parameters()
self.audio_encoder_emo = Wav2Vec2ForSpeechClassification.from_pretrained(
"r-f/wav2vec-english-speech-emotion-recognition")
self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(
"r-f/wav2vec-english-speech-emotion-recognition")
self.audio_encoder_emo.wav2vec2.feature_extractor._freeze_parameters()
self.max_seq_len = args.max_seq_len
self.audio_feature_map_cont = nn.Linear(1024, 512)
self.audio_feature_map_emo = nn.Linear(1024, 832)
self.audio_feature_map_emo2 = nn.Linear(832, 256)
self.relu = nn.ReLU()
self.biased_mask1 = init_biased_mask(n_head=4, max_seq_len=args.max_seq_len, period=args.period)
self.one_hot_level = np.eye(2)
self.obj_vector_level = nn.Linear(2, 32)
self.one_hot_person = np.eye(24)
self.obj_vector_person = nn.Linear(24, 32)
decoder_layer = nn.TransformerDecoderLayer(d_model=args.feature_dim, nhead=4, dim_feedforward=args.feature_dim,
batch_first=True)
self.transformer_decoder = nn.TransformerDecoder(decoder_layer, num_layers=1)
self.bs_map_r = nn.Linear(self.feature_dim, self.bs_dim)
nn.init.constant_(self.bs_map_r.weight, 0)
nn.init.constant_(self.bs_map_r.bias, 0)
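# forward() is the training path: it consumes a pair of audio inputs (data["input12"],
# data["input21"]) and returns two reconstructed blendshape sequences plus the emotion
# logits predicted by the emotion encoder, in line with EmoTalk's cross-reconstruction
# training; predict() below is the single-audio inference path used by demo.py.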
def forward(self, data):
frame_num11 = data["target11"].shape[1]
frame_num12 = data["target12"].shape[1]
inputs12 = self.processor(torch.squeeze(data["input12"]), sampling_rate=16000, return_tensors="pt",
padding="longest").input_values.to(self.device)
hidden_states_cont1 = self.audio_encoder_cont(inputs12, frame_num=frame_num11).last_hidden_state
hidden_states_cont12 = self.audio_encoder_cont(inputs12, frame_num=frame_num12).last_hidden_state
inputs21 = self.feature_extractor(torch.squeeze(data["input21"]), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
inputs12 = self.feature_extractor(torch.squeeze(data["input12"]), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
output_emo1 = self.audio_encoder_emo(inputs21, frame_num=frame_num11)
output_emo2 = self.audio_encoder_emo(inputs12, frame_num=frame_num12)
hidden_states_emo1 = output_emo1.hidden_states
hidden_states_emo2 = output_emo2.hidden_states
label1 = output_emo1.logits
onehot_level = self.one_hot_level[data["level"]]
onehot_level = torch.from_numpy(onehot_level).to(self.device).float()
onehot_person = self.one_hot_person[data["person"]]
onehot_person = torch.from_numpy(onehot_person).to(self.device).float()
if data["target11"].shape[0] == 1:
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0)
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0)
else:
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0).permute(1, 0, 2)
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0).permute(1, 0, 2)
obj_embedding_level11 = obj_embedding_level.repeat(1, frame_num11, 1)
obj_embedding_level12 = obj_embedding_level.repeat(1, frame_num12, 1)
obj_embedding_person11 = obj_embedding_person.repeat(1, frame_num11, 1)
obj_embedding_person12 = obj_embedding_person.repeat(1, frame_num12, 1)
hidden_states_cont1 = self.audio_feature_map_cont(hidden_states_cont1)
hidden_states_emo11_832 = self.audio_feature_map_emo(hidden_states_emo1)
hidden_states_emo11_256 = self.relu(self.audio_feature_map_emo2(hidden_states_emo11_832))
hidden_states11 = torch.cat(
[hidden_states_cont1, hidden_states_emo11_256, obj_embedding_level11, obj_embedding_person11], dim=2)
hidden_states_cont12 = self.audio_feature_map_cont(hidden_states_cont12)
hidden_states_emo12_832 = self.audio_feature_map_emo(hidden_states_emo2)
hidden_states_emo12_256 = self.relu(self.audio_feature_map_emo2(hidden_states_emo12_832))
hidden_states12 = torch.cat(
[hidden_states_cont12, hidden_states_emo12_256, obj_embedding_level12, obj_embedding_person12], dim=2)
if data["target11"].shape[0] == 1:
tgt_mask11 = self.biased_mask1[:, :hidden_states11.shape[1], :hidden_states11.shape[1]].clone().detach().to(
device=self.device)
tgt_mask22 = self.biased_mask1[:, :hidden_states12.shape[1], :hidden_states12.shape[1]].clone().detach().to(
device=self.device)
memory_mask11 = enc_dec_mask(self.device, hidden_states11.shape[1], hidden_states11.shape[1])
memory_mask12 = enc_dec_mask(self.device, hidden_states12.shape[1], hidden_states12.shape[1])
bs_out11 = self.transformer_decoder(hidden_states11, hidden_states_emo11_832, tgt_mask=tgt_mask11,
memory_mask=memory_mask11)
bs_out12 = self.transformer_decoder(hidden_states12, hidden_states_emo12_832, tgt_mask=tgt_mask22,
memory_mask=memory_mask12)
bs_output11 = self.bs_map_r(bs_out11)
bs_output12 = self.bs_map_r(bs_out12)
return bs_output11, bs_output12, label1
def predict(self, audio, level, person):
frame_num11 = math.ceil(audio.shape[1] / 16000 * 30)
inputs12 = self.processor(torch.squeeze(audio), sampling_rate=16000, return_tensors="pt",
padding="longest").input_values.to(self.device)
hidden_states_cont1 = self.audio_encoder_cont(inputs12, frame_num=frame_num11).last_hidden_state
inputs12 = self.feature_extractor(torch.squeeze(audio), sampling_rate=16000, padding=True,
return_tensors="pt").input_values.to(self.device)
output_emo1 = self.audio_encoder_emo(inputs12, frame_num=frame_num11)
hidden_states_emo1 = output_emo1.hidden_states
onehot_level = self.one_hot_level[level]
onehot_level = torch.from_numpy(onehot_level).to(self.device).float()
onehot_person = self.one_hot_person[person]
onehot_person = torch.from_numpy(onehot_person).to(self.device).float()
if audio.shape[0] == 1:
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0)
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0)
else:
obj_embedding_level = self.obj_vector_level(onehot_level).unsqueeze(0).permute(1, 0, 2)
obj_embedding_person = self.obj_vector_person(onehot_person).unsqueeze(0).permute(1, 0, 2)
obj_embedding_level11 = obj_embedding_level.repeat(1, frame_num11, 1)
obj_embedding_person11 = obj_embedding_person.repeat(1, frame_num11, 1)
hidden_states_cont1 = self.audio_feature_map_cont(hidden_states_cont1)
hidden_states_emo11_832 = self.audio_feature_map_emo(hidden_states_emo1)
hidden_states_emo11_256 = self.relu(
self.audio_feature_map_emo2(hidden_states_emo11_832))
hidden_states11 = torch.cat(
[hidden_states_cont1, hidden_states_emo11_256, obj_embedding_level11, obj_embedding_person11], dim=2)
if audio.shape[0] == 1:
tgt_mask11 = self.biased_mask1[:, :hidden_states11.shape[1],
:hidden_states11.shape[1]].clone().detach().to(device=self.device)
memory_mask11 = enc_dec_mask(self.device, hidden_states11.shape[1], hidden_states11.shape[1])
bs_out11 = self.transformer_decoder(hidden_states11, hidden_states_emo11_832, tgt_mask=tgt_mask11,
memory_mask=memory_mask11)
bs_output11 = self.bs_map_r(bs_out11)
return bs_output11

103
emotalk_own/readme.md Normal file

@@ -0,0 +1,103 @@
![Psyche AI Inc release](./media/psy_logo.png)
# EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation [ICCV2023]
Official PyTorch implementation for the paper:
> **EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation**, ***ICCV 2023***.
>
> Ziqiao Peng, Haoyu Wu, Zhenbo Song, Hao Xu, Xiangyu Zhu, Jun He, Hongyan Liu, Zhaoxin Fan
>
> [Arxiv](https://arxiv.org/abs/2303.11089) | [Project Page](https://ziqiaopeng.github.io/emotalk/) | [License](https://github.com/psyai-net/EmoTalk_release/blob/main/LICENSE)
<p align="center">
<img src="./media/emotalk.png" width="90%" />
</p>
> Given audio input expressing different emotions, EmoTalk produces realistic 3D facial animation sequences with corresponding emotional expressions as outputs.
## News
- `2023.10.17` Thanks to [noirmist](https://github.com/noirmist)! Now you can create the environment via docker.
## Environment
- Linux
- Python 3.8.8
- PyTorch 1.12.1
- CUDA 11.3
- Blender 3.4.1
- ffmpeg 4.4.1
Clone the repo:
```bash
git clone https://github.com/psyai-net/EmoTalk_release.git
cd EmoTalk_release
```
Create conda environment:
```bash
conda create -n emotalk python=3.8.8
conda activate emotalk
pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113
pip install -r requirements.txt
```
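Alternatively, you can build the environment with the provided Dockerfile (see the News note above); the image tag and mount path below are placeholders:
```bash
docker build -t emotalk .
docker run --gpus all -it -v $(pwd):/workspace emotalk zsh
```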
## **Demo**
Download Blender and put it in this directory.
```bash
wget https://ftp.nluug.nl/pub/graphics/blender/release/Blender3.4/blender-3.4.1-linux-x64.tar.xz
tar -xf blender-3.4.1-linux-x64.tar.xz
mv blender-3.4.1-linux-x64 blender && rm blender-3.4.1-linux-x64.tar.xz
```
Download the pretrained models from [EmoTalk.pth](https://drive.google.com/file/d/1KQZ-WGI9VDFLqgNXvJQosKVCbjTaCPqK/view?usp=drive_link) (Updated). Put the pretrained models under the `pretrain_model` folder.
Put the audio under the `audio` folder and run
```bash
python demo.py --wav_path "./audio/disgust.wav"
```
The generated animation will be saved in the `result` folder.
## **Dataset**
If you would like to download the 3D-ETF dataset, please fill in the [agreement](https://drive.google.com/file/d/1AQ5_focSgw9WiJdA2R44BQOrdTUe2ABd/view?usp=drive_link), and use your educational email address to email Ziqiao Peng (pengziqiao@ruc.edu.cn) and cc Zhaoxin Fan (fanzhaoxin@psyai.net) to request the download link.
## **Citation**
If you find this work useful for your research, please cite our paper:
```
@InProceedings{Peng_2023_ICCV,
author = {Peng, Ziqiao and Wu, Haoyu and Song, Zhenbo and Xu, Hao and Zhu, Xiangyu and He, Jun and Liu, Hongyan and Fan, Zhaoxin},
title = {EmoTalk: Speech-Driven Emotional Disentanglement for 3D Face Animation},
booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
month = {October},
year = {2023},
pages = {20687-20697}
}
```
## **Acknowledgement**
Here are some great resources we benefit from:
- [Faceformer](https://github.com/EvelynFan/FaceFormer) for training pipeline
- [EVP](https://github.com/jixinya/EVP) for training dataloader
- [Speech-driven-expressions](https://github.com/YoungSeng/Speech-driven-expressions) for rendering
- [Wav2Vec2 Content](https://huggingface.co/jonatasgrosman/wav2vec2-large-xlsr-53-english) and [Wav2Vec2 Emotion](https://huggingface.co/r-f/wav2vec-english-speech-emotion-recognition) for audio encoder
- [Head Template](http://filmicworlds.com/blog/solving-face-scans-for-arkit/) for visualization.
Thanks to John Hable for sharing his head template under the CC0 license, which is very helpful for us to visualize the results.
## **Contact**
For research purposes, such as comparison of experimental results, please contact pengziqiao@ruc.edu.cn
For commercial licensing, please contact fanzhaoxin@psyai.net
## **License**
This project is licensed under the Creative Commons Attribution-NonCommercial 4.0 International License. Please read the [LICENSE](LICENSE) file for more information.
## **Invitation**
We invite you to join [Psyche AI Inc](https://www.psyai.com/home) to conduct cutting-edge research and business implementation together. At Psyche AI Inc, we are committed to pushing the boundaries of what's possible in the fields of artificial intelligence and computer vision, especially their applications in avatars. As a member of our team, you will have the opportunity to collaborate with talented individuals, innovate new ideas, and contribute to projects that have a real-world impact.
If you are passionate about working at the forefront of technology and making a difference, we would love to hear from you. Please visit our website at [Psyche AI Inc](https://www.psyai.com/home) to learn more about us and to apply for open positions. You can also contact us at fanzhaoxin@psyai.net.
Let's shape the future together!!

87
emotalk_own/render.py Normal file

@@ -0,0 +1,87 @@
import bpy
import os
import numpy as np
import sys
filename = str(sys.argv[-1])
root_dir = str(sys.argv[-2])
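# This script is launched by demo.py as:
#   blender -t 64 -b render.blend -P render.py -- <result_path> <wav_name>
# Blender forwards everything after "--" via sys.argv, so the last two arguments are
# the results directory and the clip name of the .npy blendshape file to render.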
model_bsList = ["browDownLeft",
"browDownRight",
"browInnerUp",
"browOuterUpLeft",
"browOuterUpRight",
"cheekPuff",
"cheekSquintLeft",
"cheekSquintRight",
"eyeBlinkLeft",
"eyeBlinkRight",
"eyeLookDownLeft",
"eyeLookDownRight",
"eyeLookInLeft",
"eyeLookInRight",
"eyeLookOutLeft",
"eyeLookOutRight",
"eyeLookUpLeft",
"eyeLookUpRight",
"eyeSquintLeft",
"eyeSquintRight",
"eyeWideLeft",
"eyeWideRight",
"jawForward",
"jawLeft",
"jawOpen",
"jawRight",
"mouthClose",
"mouthDimpleLeft",
"mouthDimpleRight",
"mouthFrownLeft",
"mouthFrownRight",
"mouthFunnel",
"mouthLeft",
"mouthLowerDownLeft",
"mouthLowerDownRight",
"mouthPressLeft",
"mouthPressRight",
"mouthPucker",
"mouthRight",
"mouthRollLower",
"mouthRollUpper",
"mouthShrugLower",
"mouthShrugUpper",
"mouthSmileLeft",
"mouthSmileRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
"noseSneerLeft",
"noseSneerRight",
"tongueOut"]
obj = bpy.data.objects["face"]
bpy.context.scene.render.engine = 'BLENDER_WORKBENCH'
bpy.context.scene.display.shading.light = 'MATCAP'
bpy.context.scene.display.render_aa = 'FXAA'
bpy.context.scene.render.resolution_x = int(512)
bpy.context.scene.render.resolution_y = int(768)
bpy.context.scene.render.fps = 30
bpy.context.scene.render.image_settings.file_format = 'PNG'
cam = bpy.data.objects['Camera']
cam.scale = [2, 2, 2]
bpy.context.scene.camera = cam
output_dir = root_dir + filename
blendshape_path = root_dir + filename + '.npy'
result = []
bs = np.load(blendshape_path)
for i in range(bs.shape[0]):
curr_bs = bs[i]
for j in range(52):
obj.data.shape_keys.key_blocks[model_bsList[j]].value = curr_bs[j]
bpy.context.scene.render.filepath = os.path.join(output_dir, '{}.png'.format(i))
bpy.ops.render.render(write_still=True)

5
emotalk_own/requirements.txt Normal file

@@ -0,0 +1,5 @@
numpy~=1.21.6
transformers~=4.26.0
tqdm~=4.64.1
librosa~=0.10.0
scipy~=1.9.1

39
emotalk_own/utils.py Normal file

@@ -0,0 +1,39 @@
# Borrowed from https://github.com/EvelynFan/FaceFormer/blob/main/faceformer.py
import torch
import math
# Temporal Bias
def init_biased_mask(n_head, max_seq_len, period):
def get_slopes(n):
def get_slopes_power_of_2(n):
start = (2 ** (-2 ** -(math.log2(n) - 3)))
ratio = start
return [start * ratio ** i for i in range(n)]
if math.log2(n).is_integer():
return get_slopes_power_of_2(n)
else:
closest_power_of_2 = 2 ** math.floor(math.log2(n))
return get_slopes_power_of_2(closest_power_of_2) + get_slopes(2 * closest_power_of_2)[0::2][
:n - closest_power_of_2]
slopes = torch.Tensor(get_slopes(n_head))
bias = torch.arange(start=0, end=max_seq_len, step=period).unsqueeze(1).repeat(1, period).view(-1) // (period)
bias = - torch.flip(bias, dims=[0])
alibi = torch.zeros(max_seq_len, max_seq_len)
for i in range(max_seq_len):
alibi[i, :i + 1] = bias[-(i + 1):]
alibi = slopes.unsqueeze(1).unsqueeze(1) * alibi.unsqueeze(0)
mask = (torch.triu(torch.ones(max_seq_len, max_seq_len)) == 1).transpose(0, 1)
mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
mask = mask.unsqueeze(0) + alibi
return mask
# Alignment Bias
def enc_dec_mask(device, T, S):
mask = torch.ones(T, S).to(device)
for i in range(T):
mask[i, i] = 0
return (mask == 1).to(device=device)
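# Quick shape check (illustrative): for 4 heads, a 10-frame sequence and period 1,
# init_biased_mask yields a (4, 10, 10) additive attention bias (ALiBi-style causal
# mask) and enc_dec_mask yields a (10, 10) boolean mask whose diagonal is the only
# unmasked position, i.e. target frame i may only attend to source frame i.
if __name__ == "__main__":
    print(init_biased_mask(n_head=4, max_seq_len=10, period=1).shape)  # torch.Size([4, 10, 10])
    print(enc_dec_mask(torch.device("cpu"), 10, 10).shape)             # torch.Size([10, 10])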

245
emotalk_own/wav2vec.py Executable file
View File

@@ -0,0 +1,245 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from dataclasses import dataclass
from transformers import Wav2Vec2Model, Wav2Vec2PreTrainedModel
from transformers.modeling_outputs import BaseModelOutput
from typing import Optional, Tuple
from transformers.file_utils import ModelOutput
from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
_CONFIG_FOR_DOC = "Wav2Vec2Config"
_HIDDEN_STATES_START_POSITION = 2
# the implementation of Wav2Vec2Model is borrowed from https://huggingface.co/transformers/_modules/transformers/models/wav2vec2/modeling_wav2vec2.html#Wav2Vec2Model
# initialize our encoder with the pre-trained wav2vec 2.0 weights.
def _compute_mask_indices(
shape: Tuple[int, int],
mask_prob: float,
mask_length: int,
attention_mask: Optional[torch.Tensor] = None,
min_masks: int = 0,
) -> np.ndarray:
bsz, all_sz = shape
mask = np.full((bsz, all_sz), False)
all_num_mask = int(
mask_prob * all_sz / float(mask_length)
+ np.random.rand()
)
all_num_mask = max(min_masks, all_num_mask)
mask_idcs = []
padding_mask = attention_mask.ne(1) if attention_mask is not None else None
for i in range(bsz):
if padding_mask is not None:
sz = all_sz - padding_mask[i].long().sum().item()
num_mask = int(
mask_prob * sz / float(mask_length)
+ np.random.rand()
)
num_mask = max(min_masks, num_mask)
else:
sz = all_sz
num_mask = all_num_mask
lengths = np.full(num_mask, mask_length)
if sum(lengths) == 0:
lengths[0] = min(mask_length, sz - 1)
min_len = min(lengths)
if sz - min_len <= num_mask:
min_len = sz - num_mask - 1
mask_idc = np.random.choice(sz - min_len, num_mask, replace=False)
mask_idc = np.asarray([mask_idc[j] + offset for j in range(len(mask_idc)) for offset in range(lengths[j])])
mask_idcs.append(np.unique(mask_idc[mask_idc < sz]))
min_len = min([len(m) for m in mask_idcs])
for i, mask_idc in enumerate(mask_idcs):
if len(mask_idc) > min_len:
mask_idc = np.random.choice(mask_idc, min_len, replace=False)
mask[i, mask_idc] = True
return mask
# linear interpolation layer
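# Resamples the wav2vec 2.0 feature sequence (roughly 50 feature frames per second of
# 16 kHz audio) onto the 30 fps blendshape timeline expected by the decoder.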
def linear_interpolation(features, input_fps, output_fps, output_len=None):
features = features.transpose(1, 2)
seq_len = features.shape[2] / float(input_fps)
if output_len is None:
output_len = int(seq_len * output_fps)
output_features = F.interpolate(features, size=output_len, align_corners=True, mode='linear')
return output_features.transpose(1, 2)
class Wav2Vec2Model(Wav2Vec2Model):
def __init__(self, config):
super().__init__(config)
self.lm_head = nn.Linear(1024, 32)
def forward(
self,
input_values,
attention_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
frame_num=None
):
self.config.output_attentions = True
output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
output_hidden_states = (
output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
)
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
hidden_states = self.feature_extractor(input_values)
hidden_states = hidden_states.transpose(1, 2)
hidden_states = linear_interpolation(hidden_states, 50, 30, output_len=frame_num)
if attention_mask is not None:
output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
attention_mask = torch.zeros(
hidden_states.shape[:2], dtype=hidden_states.dtype, device=hidden_states.device
)
attention_mask[
(torch.arange(attention_mask.shape[0], device=hidden_states.device), output_lengths - 1)
] = 1
attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
hidden_states = self.feature_projection(hidden_states)[0]
encoder_outputs = self.encoder(
hidden_states,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = encoder_outputs[0]
if not return_dict:
return (hidden_states,) + encoder_outputs[1:]
return BaseModelOutput(
last_hidden_state=hidden_states,
hidden_states=encoder_outputs.hidden_states,
attentions=encoder_outputs.attentions,
)
@dataclass
class SpeechClassifierOutput(ModelOutput):
loss: Optional[torch.FloatTensor] = None
logits: torch.FloatTensor = None
hidden_states: Optional[Tuple[torch.FloatTensor]] = None
attentions: Optional[Tuple[torch.FloatTensor]] = None
class Wav2Vec2ClassificationHead(nn.Module):
"""Head for wav2vec classification task."""
def __init__(self, config):
super().__init__()
self.dense = nn.Linear(config.hidden_size, config.hidden_size)
self.dropout = nn.Dropout(config.final_dropout)
self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
def forward(self, features, **kwargs):
x = features
x = self.dropout(x)
x = self.dense(x)
x = torch.tanh(x)
x = self.dropout(x)
x = self.out_proj(x)
return x
class Wav2Vec2ForSpeechClassification(Wav2Vec2PreTrainedModel):
def __init__(self, config):
super().__init__(config)
self.num_labels = config.num_labels
self.pooling_mode = config.pooling_mode
self.config = config
self.wav2vec2 = Wav2Vec2Model(config)
self.classifier = Wav2Vec2ClassificationHead(config)
self.init_weights()
def freeze_feature_extractor(self):
self.wav2vec2.feature_extractor._freeze_parameters()
def merged_strategy(
self,
hidden_states,
mode="mean"
):
if mode == "mean":
outputs = torch.mean(hidden_states, dim=1)
elif mode == "sum":
outputs = torch.sum(hidden_states, dim=1)
elif mode == "max":
outputs = torch.max(hidden_states, dim=1)[0]
else:
raise Exception(
"The pooling method hasn't been defined! Your pooling mode must be one of these ['mean', 'sum', 'max']")
return outputs
def forward(
self,
input_values,
attention_mask=None,
output_attentions=None,
output_hidden_states=None,
return_dict=None,
labels=None,
frame_num=None,
):
return_dict = return_dict if return_dict is not None else self.config.use_return_dict
outputs = self.wav2vec2(
input_values,
attention_mask=attention_mask,
output_attentions=output_attentions,
output_hidden_states=output_hidden_states,
return_dict=return_dict,
)
hidden_states = outputs[0]
hidden_states1 = linear_interpolation(hidden_states, 50, 30, output_len=frame_num)
hidden_states = self.merged_strategy(hidden_states1, mode=self.pooling_mode)
logits = self.classifier(hidden_states)
loss = None
if labels is not None:
if self.config.problem_type is None:
if self.num_labels == 1:
self.config.problem_type = "regression"
elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
self.config.problem_type = "single_label_classification"
else:
self.config.problem_type = "multi_label_classification"
if self.config.problem_type == "regression":
loss_fct = MSELoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels)
elif self.config.problem_type == "single_label_classification":
loss_fct = CrossEntropyLoss()
loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
elif self.config.problem_type == "multi_label_classification":
loss_fct = BCEWithLogitsLoss()
loss = loss_fct(logits, labels)
if not return_dict:
output = (logits,) + outputs[2:]
return ((loss,) + output) if loss is not None else output
return SpeechClassifierOutput(
loss=loss,
logits=logits,
hidden_states=hidden_states1,
attentions=outputs.attentions,
)
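A brief usage sketch for the modified encoder above, assuming an arbitrary wav2vec 2.0 "large" checkpoint (hidden_size=1024, matching the lm_head) and a hypothetical example.wav clip; frame_num resamples the roughly 50 fps wav2vec features to a fixed number of 30 fps frames via linear_interpolation.

import torch
import librosa

# Hypothetical checkpoint; any wav2vec 2.0 model with hidden_size=1024 fits the lm_head above.
encoder = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-large-960h")
encoder.eval()

speech, sr = librosa.load("example.wav", sr=16000)       # hypothetical 3-second clip
inputs = torch.FloatTensor(speech).unsqueeze(0)          # (1, num_samples)
with torch.no_grad():
    out = encoder(inputs, frame_num=90)                  # 90 frames = 3 s at 30 fps
print(out.last_hidden_state.shape)                       # (1, 90, 1024)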

4
main.py Normal file
View File

@@ -0,0 +1,4 @@
from piedemo.web import Web
from piedemo.fields.ajax_group import AjaxChatField

View File

@@ -0,0 +1,164 @@
import random
import shutil
model_bsList = ["browDownLeft",
"browDownRight",
"browInnerUp",
"browOuterUpLeft",
"browOuterUpRight",
"cheekPuff",
"cheekSquintLeft",
"cheekSquintRight",
"eyeBlinkLeft",
"eyeBlinkRight",
"eyeLookDownLeft",
"eyeLookDownRight",
"eyeLookInLeft",
"eyeLookInRight",
"eyeLookOutLeft",
"eyeLookOutRight",
"eyeLookUpLeft",
"eyeLookUpRight",
"eyeSquintLeft",
"eyeSquintRight",
"eyeWideLeft",
"eyeWideRight",
"jawForward",
"jawLeft",
"jawOpen",
"jawRight",
"mouthClose",
"mouthDimpleLeft",
"mouthDimpleRight",
"mouthFrownLeft",
"mouthFrownRight",
"mouthFunnel",
"mouthLeft",
"mouthLowerDownLeft",
"mouthLowerDownRight",
"mouthPressLeft",
"mouthPressRight",
"mouthPucker",
"mouthRight",
"mouthRollLower",
"mouthRollUpper",
"mouthShrugLower",
"mouthShrugUpper",
"mouthSmileLeft",
"mouthSmileRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
"noseSneerLeft",
"noseSneerRight",
"tongueOut"]
import bpy
import os
import numpy as np
import sys
filename = str(sys.argv[-1])
root_dir = str(sys.argv[-2])
object_name = "MFA_body"
obj = bpy.data.objects[object_name]
bpy.context.scene.render.engine = 'BLENDER_WORKBENCH'
bpy.context.scene.display.shading.light = 'MATCAP'
bpy.context.scene.display.render_aa = 'FXAA'
bpy.context.scene.render.resolution_x = int(512)
bpy.context.scene.render.resolution_y = int(768)
bpy.context.scene.render.fps = 30
bpy.context.scene.render.image_settings.file_format = 'PNG'
cam = bpy.data.objects['0Camera']
cam.scale = [2, 2, 2]
bpy.context.scene.camera = cam
"""
model_bsList = ['Basis',
'0',
'X_postrig',
'X_neck',
'X_head',
'X_eyesfix',
'X_breast',
'X_nails',
'X_pus_conf.1',
'X_pus_assym', 'X_jadafication',
'X_facetweak', 'X_eyeshape',
'A_nipple_in', 'A_nailsmax',
'A_pregnant', 'PAD_breathe',
'PAD_swallow', 'Head',
'cr_neck1', 'cr_neck2',
'cr_neck3.R', 'cr_neck3.L',
'cr_neck4.L', 'cr_neck4.R', 'cr_jaw1', 'cr_jaw2', 'sqz_jaw3', 'cr_brows_dwn', 'cr_brows_up',
'cr_eye_lookdown', 'cr_eye_open',
'cr_eye_look.L', 'cr_eye_look.R', 'cr_mouthmax.L', 'cr_mouthmax.R', 'cr_cheekin.L', 'cr_cheekin.R', 'Body', 'cr_spine',
'cr_spine2', 'cr_spine3', 'cr_spine2.L',
'cr_spine2.R', 'cr_spine4.L', 'cr_spine4.R',
'cr_spine5.L', 'cr_spine5.R', 'cr_lowerspine.bcw',
'cr_lowerspine.fwd', 'size_breastXL.L', 'size_breastXL.R',
'size_breastXS.L', 'size_breastXS.R', 'size_oreola.L',
'size_oreola.R', 'Legs', 'cr_hipout.L', 'cr_hipout.R',
'cr_hipin.L', 'cr_hipin.R', 'cr_pussyflattern',
'cr_hip0.L', 'cr_hip0.R', 'cr_hip1.L', 'cr_hip1.R',
'cr_hip45.L', 'cr_hip45.R', 'sqz_hip1max.L',
'sqz_hip1max.R', 'sqz_hip1vol.L', 'sqz_hip1vol.R',
'sqz_hip1squeeze.L', 'sqz_hip1squeeze.R', 'cr_hip2.L',
'cr_hip2.R', 'sqz_hip2.L', 'sqz_hip2.R', 'cr_hip3.L',
'cr_hip3.R', 'sqz_buttrest.L', 'sqz_buttrest.R',
'cr_knee45.L', 'cr_knee45.R', 'cr_knee.L', 'cr_knee.R',
'sqz_knee.L', 'sqz_knee.R', 'sqz_stance.L', 'sqz_stance.R',
'cr_buttheart.L', 'cr_buttheart.R', 'rest_buttcheek.L',
'rest_buttcheek.R', 'rest_knee.L', 'rest_knee.R', 'rest_knee_fat.L',
'rest_knee_fat.R', 'rest_hip.L', 'rest_hip.R', 'vol_butt.L',
'vol_butt.R', 'Feet', 'cr_feet1.L', 'cr_feet1.R', 'cr_feet2.L',
'cr_feet2.R', 'cr_feet3.L', 'cr_feet3.R', 'cr_toe1.L', 'cr_toe1.R',
'cr_toe2.L', 'cr_toe2.R', 'Arms', 'cr_arm-up.L', 'cr_arm-up.R',
'cr_arm-fwd.L', 'cr_arm-fwd.R', 'cr_arm-dwn.L', 'cr_arm-dwn.R',
'sqz_arm-fwd.L', 'sqz_arm-fwd.R', 'sqz_armpit.L', 'sqz_armpit.R',
'sqz_arm-bcw.L', 'sqz_arm-bcw.R', 'sqz_arm-bcw_max.L',
'sqz_arm-bcw_max.R', 'cr_arm-trc.L', 'cr_arm-trc.R',
'D_cr_elbow.L', 'U_cr_elbow.L', 'D_cr_elbow.R', 'U_cr_elbow.R',
'D_sqz_elbowMax.L', 'U_sqz_elbowMax.L', 'D_sqz_elbowMax.R',
'U_sqz_elbowMax.R', 'cr_armrest.L', 'cr_armrest.R',
'cr_shoulder_fwd.L', 'cr_shoulder_fwd.R', 'cr_shoulder_bcw.L',
'cr_shoulder_bcw.R', 'cr_shoulder_dwn.L', 'cr_shoulder_dwn.R',
'cr_shoulder_up.L', 'cr_shoulder_up.R', 'rest_elbow.L', 'rest_elbow.R',
'Hands', 'cr_hand1.L', 'cr_hand1.R',
'cr_hand2.L', 'cr_hand2.R', 'cr_handtwistU.L', 'cr_handtwistU.R',
'cr_handtwistD.L',
'cr_handtwistD.R',
'cr_thumb.01.L', 'cr_thumb.01.R',
'cr_f_index.01.L', 'cr_f_index.01.R', 'cr_f_index.02.L',
'cr_f_index.02.R',
'cr_f_middle.01.L', 'cr_f_middle.01.R', 'cr_f_middle.02.L',
'cr_f_middle.02.R', 'cr_f_ring.01.L', 'cr_f_ring.01.R',
'cr_f_ring.02.L', 'cr_f_ring.02.R', 'cr_f_pinky.01.L',
'cr_f_pinky.01.R', 'cr_f_pinky.02.L', 'cr_f_pinky.02.R', 'EM',
'em_eye_close.L', 'em_eye_close.R', 'em_eye_half.L', 'em_eye_half.R',
'em_smile_open', 'em_smile_close', 'em_kiss', 'em_disg', 'em_blow',
'em_surprise', 'em_sad', 'em_frown', 'PH', 'ph_+', 'ph_bpm',
'ph_fv', 'ph_ou',
'ph_e', 'ph_r', 'ph_ch', 'ph_th', 'ph_a']"""
model_bsList = list(obj.data.shape_keys.key_blocks.keys())
# print(obj.data.shape_keys.key_blocks.keys())
output_dir = root_dir + filename
blendshape_path = root_dir + filename + '.npy'
result = []
bs = np.load(blendshape_path)
for i in range(10):
for kp_name in model_bsList:
obj.data.shape_keys.key_blocks[kp_name].value = random.random()
bpy.context.scene.render.filepath = os.path.join(output_dir,
'{}.png'.format(i))
bpy.ops.render.render(write_still=True)

0
miapia_own/__init__.py Normal file
View File

57
miapia_own/a.py Normal file
View File

@@ -0,0 +1,57 @@
import bpy
import os
import numpy as np
import sys
filename = str(sys.argv[-1])
root_dir = str(sys.argv[-2])
object_name = "MFA_body"
obj = bpy.data.objects[object_name]
bpy.context.scene.render.engine = 'BLENDER_WORKBENCH'
bpy.context.scene.display.shading.light = 'MATCAP'
bpy.context.scene.display.render_aa = 'FXAA'
bpy.context.scene.render.resolution_x = int(512)
bpy.context.scene.render.resolution_y = int(768)
bpy.context.scene.render.fps = 30
bpy.context.scene.render.image_settings.file_format = 'PNG'
cam = bpy.data.objects['0Camera']
cam.scale = [2, 2, 2]
bpy.context.scene.camera = cam
model_bsList = ['Basis',
'0',
'X_postrig',
'X_neck',
'X_head',
'X_eyesfix',
'X_breast',
'X_nails',
'X_pus_conf.1',
'X_pus_assym', 'X_jadafication',
'X_facetweak', 'X_eyeshape',
'A_nipple_in', 'A_nailsmax',
'A_pregnant', 'PAD_breathe',
'PAD_swallow', 'Head',
'cr_neck1', 'cr_neck2',
'cr_neck3.R', 'cr_neck3.L',
'cr_neck4.L', 'cr_neck4.R', 'cr_jaw1', 'cr_jaw2', 'sqz_jaw3', 'cr_brows_dwn', 'cr_brows_up',
'cr_eye_lookdown', 'cr_eye_open',
'cr_eye_look.L', 'cr_eye_look.R', 'cr_mouthmax.L', 'cr_mouthmax.R', 'cr_cheekin.L', 'cr_cheekin.R', 'Body', 'cr_spine', 'cr_spine2', 'cr_spine3', 'cr_spine2.L', 'cr_spine2.R', 'cr_spine4.L', 'cr_spine4.R', 'cr_spine5.L', 'cr_spine5.R', 'cr_lowerspine.bcw', 'cr_lowerspine.fwd', 'size_breastXL.L', 'size_breastXL.R', 'size_breastXS.L', 'size_breastXS.R', 'size_oreola.L', 'size_oreola.R', 'Legs', 'cr_hipout.L', 'cr_hipout.R', 'cr_hipin.L', 'cr_hipin.R', 'cr_pussyflattern', 'cr_hip0.L', 'cr_hip0.R', 'cr_hip1.L', 'cr_hip1.R', 'cr_hip45.L', 'cr_hip45.R', 'sqz_hip1max.L', 'sqz_hip1max.R', 'sqz_hip1vol.L', 'sqz_hip1vol.R', 'sqz_hip1squeeze.L', 'sqz_hip1squeeze.R', 'cr_hip2.L', 'cr_hip2.R', 'sqz_hip2.L', 'sqz_hip2.R', 'cr_hip3.L', 'cr_hip3.R', 'sqz_buttrest.L', 'sqz_buttrest.R', 'cr_knee45.L', 'cr_knee45.R', 'cr_knee.L', 'cr_knee.R', 'sqz_knee.L', 'sqz_knee.R', 'sqz_stance.L', 'sqz_stance.R', 'cr_buttheart.L', 'cr_buttheart.R', 'rest_buttcheek.L', 'rest_buttcheek.R', 'rest_knee.L', 'rest_knee.R', 'rest_knee_fat.L', 'rest_knee_fat.R', 'rest_hip.L', 'rest_hip.R', 'vol_butt.L', 'vol_butt.R', 'Feet', 'cr_feet1.L', 'cr_feet1.R', 'cr_feet2.L', 'cr_feet2.R', 'cr_feet3.L', 'cr_feet3.R', 'cr_toe1.L', 'cr_toe1.R', 'cr_toe2.L', 'cr_toe2.R', 'Arms', 'cr_arm-up.L', 'cr_arm-up.R', 'cr_arm-fwd.L', 'cr_arm-fwd.R', 'cr_arm-dwn.L', 'cr_arm-dwn.R', 'sqz_arm-fwd.L', 'sqz_arm-fwd.R', 'sqz_armpit.L', 'sqz_armpit.R', 'sqz_arm-bcw.L', 'sqz_arm-bcw.R', 'sqz_arm-bcw_max.L', 'sqz_arm-bcw_max.R', 'cr_arm-trc.L', 'cr_arm-trc.R', 'D_cr_elbow.L', 'U_cr_elbow.L', 'D_cr_elbow.R', 'U_cr_elbow.R', 'D_sqz_elbowMax.L', 'U_sqz_elbowMax.L', 'D_sqz_elbowMax.R', 'U_sqz_elbowMax.R', 'cr_armrest.L', 'cr_armrest.R', 'cr_shoulder_fwd.L', 'cr_shoulder_fwd.R', 'cr_shoulder_bcw.L', 'cr_shoulder_bcw.R', 'cr_shoulder_dwn.L', 'cr_shoulder_dwn.R', 'cr_shoulder_up.L', 'cr_shoulder_up.R', 'rest_elbow.L', 'rest_elbow.R', 'Hands', 'cr_hand1.L', 'cr_hand1.R', 'cr_hand2.L', 'cr_hand2.R', 'cr_handtwistU.L', 'cr_handtwistU.R', 'cr_handtwistD.L', 'cr_handtwistD.R', 'cr_thumb.01.L', 'cr_thumb.01.R', 'cr_f_index.01.L', 'cr_f_index.01.R', 'cr_f_index.02.L', 'cr_f_index.02.R', 'cr_f_middle.01.L', 'cr_f_middle.01.R', 'cr_f_middle.02.L', 'cr_f_middle.02.R', 'cr_f_ring.01.L', 'cr_f_ring.01.R', 'cr_f_ring.02.L', 'cr_f_ring.02.R', 'cr_f_pinky.01.L', 'cr_f_pinky.01.R', 'cr_f_pinky.02.L', 'cr_f_pinky.02.R', 'EM', 'em_eye_close.L', 'em_eye_close.R', 'em_eye_half.L', 'em_eye_half.R', 'em_smile_open', 'em_smile_close', 'em_kiss', 'em_disg', 'em_blow', 'em_surprise', 'em_sad', 'em_frown', 'PH', 'ph_+', 'ph_bpm', 'ph_fv', 'ph_ou', 'ph_e', 'ph_r', 'ph_ch', 'ph_th', 'ph_a']
# print(obj.data.shape_keys.key_blocks.keys())
output_dir = root_dir + filename
blendshape_path = root_dir + filename + '.npy'
result = []
bs = np.load(blendshape_path)
for i in range(bs.shape[0]):
obj.data.shape_keys.key_blocks['cr_eye_open'].value = i / bs.shape[0]
bpy.context.scene.render.filepath = os.path.join(output_dir,
'{}.png'.format(i))
bpy.ops.render.render(write_still=True)

36
miapia_own/aihandler.py Normal file
View File

@@ -0,0 +1,36 @@
import requests
class AIHandler(object):
def __init__(self):
pass
def __call__(self, text):
resp = requests.post("https://fast-pia.avemio.technology/chat-completion",
json={
"session-id": "chatcmpl",
"user-location": "Zweibrücken",
"wheel-of-life": [
{
"personal_growth": 10,
"health_exercise": 5,
"familiy_friends": 5,
"romance_relationship": 5,
"career_work": 5,
"finances": 5,
"recreation_fun": 5,
"living_situation": 5}
],
"messages": [
{
"role": "user",
"content": text
}
]
})
resp = resp.json()
return {
"text": resp[0]['text'],
"emotion": resp[0]['emotion']
}
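A quick usage sketch for the wrapper above; the endpoint and the response shape are the ones hardcoded in __call__.

if __name__ == "__main__":
    handler = AIHandler()
    reply = handler("Hello, how are you?")
    # The backend returns a list of messages; the wrapper surfaces text and emotion of the first one.
    print(reply["emotion"], "-", reply["text"])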

View File

@@ -0,0 +1,36 @@
import os
from time import time
import requests
import openai
class AIHandlerStream(object):
    def __init__(self):
        # Read the API key from the environment; never hardcode secrets in source control.
        self.ai = openai.OpenAI(api_key=os.environ["OPENAI_API_KEY"])
def __call__(self, text):
out = ""
for chunk in self.ai.chat.completions.create(
model="gpt-3.5-turbo",
messages=[
{"role": "system", "content": "You are PIA. You talk with short sentences. And help people."},
{"role": "user", "content": text}
], stream=True
):
delta = chunk.choices[0].delta.content
if delta is None:
continue
out += delta
if len(out) > 0 and out[-1] in ['.', '!', ',', '?']:
yield out
out = ""
if len(out) > 0:
yield out
if __name__ == "__main__":
aihandler = AIHandlerStream()
t1 = time()
for text in aihandler("Hello, how are you, what is your name?"):
print(time() - t1)
print(text)

571
miapia_own/main.py Normal file
View File

@@ -0,0 +1,571 @@
import json
import sys
import re
from time import sleep, time
import logging
from collections import defaultdict
import pandas as pd
from flask import redirect
import argparse
import base64
from flask import send_file, Response, request, jsonify
from flask_socketio import emit
from piedemo.fields.ajax_group import AjaxChatField, AjaxGroup
from piedemo.fields.grid import VStack, HStack, SpaceField
from piedemo.fields.inputs.hidden import InputHiddenField
from piedemo.fields.outputs.colored_text import ptext, OutputColoredTextField
from piedemo.fields.outputs.json import OutputJSONField
from piedemo.fields.outputs.progress import ProgressField
from piedemo.fields.outputs.video import OutputVideoField
from piedemo.hub.swagger_utils.method import describe, check_missing_keys
from piedemo.web import Web
import os
import io
from piedemo.page import Page
from piedemo.hub.svgpil import SVGImage
from piedemo.fields.outputs.table import OutputTableField
from piedemo.fields.inputs.int_list import InputIntListField
from piedemo.fields.navigation import Navigation
from piedemo.fields.inputs.chat import ChatField
import librosa
import uuid
import numpy as np
import redis
import argparse
from scipy.signal import savgol_filter
import torch
import random
import os, subprocess
import shlex
import uuid
from tqdm import tqdm
from aihandler import AIHandler
from aihandler_stream import AIHandlerStream
from pieinfer import PieInfer, render_video, construct_video
import torch
from TTS.api import TTS
logging.getLogger('socketio').setLevel(logging.ERROR)
logging.getLogger('engineio').setLevel(logging.ERROR)
target_names = [
"mouthSmileLeft",
"mouthSmileRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
]
model_bsList = ["browDownLeft",
"browDownRight",
"browInnerUp",
"browOuterUpLeft",
"browOuterUpRight",
"cheekPuff",
"cheekSquintLeft",
"cheekSquintRight",
"eyeBlinkLeft",
"eyeBlinkRight",
"eyeLookDownLeft",
"eyeLookDownRight",
"eyeLookInLeft",
"eyeLookInRight",
"eyeLookOutLeft",
"eyeLookOutRight",
"eyeLookUpLeft",
"eyeLookUpRight",
"eyeSquintLeft",
"eyeSquintRight",
"eyeWideLeft",
"eyeWideRight",
"jawForward",
"jawLeft",
"jawOpen",
"jawRight",
"mouthClose",
"mouthDimpleLeft",
"mouthDimpleRight",
"mouthFrownLeft",
"mouthFrownRight",
"mouthFunnel",
"mouthLeft",
"mouthLowerDownLeft",
"mouthLowerDownRight",
"mouthPressLeft",
"mouthPressRight",
"mouthPucker",
"mouthRight",
"mouthRollLower",
"mouthRollUpper",
"mouthShrugLower",
"mouthShrugUpper",
"mouthSmileLeft",
"mouthSmileRight",
"mouthStretchLeft",
"mouthStretchRight",
"mouthUpperUpLeft",
"mouthUpperUpRight",
"noseSneerLeft",
"noseSneerRight",
"tongueOut"]
# Get device
device = "cuda" if torch.cuda.is_available() else "cpu"
blendshapes_path = "./blendshapes"
def get_asset(fname):
return SVGImage.open(os.path.join(os.path.dirname(__file__),
"assets",
fname)).svg_content
class MainPage(Page):
def __init__(self, model_name: str):
super(MainPage, self).__init__()
self.infer = PieInfer()
self.tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
self.r = redis.Redis(host='localhost', port=6379, decode_responses=True)
self.aihandler = AIHandler()
self.aihandler_stream = AIHandlerStream()
self.fields = Navigation(AjaxGroup("ChatGroup", VStack([
HStack([
AjaxChatField("Chat",
self.register_ajax(f"/refresh_{model_name}",
self.message_sent),
deps_names=["sid",
"session_id",
"Chat",
"Chat__piedemo__file"],
use_socketio_support=True,
nopie=True,
style={
"height": "100%"
}),
OutputColoredTextField("video",
nopie=True,
use_socketio_support=True),
], xs=[8, 4]),
ProgressField("progress",
nopie=True,
use_socketio_support=True),
InputHiddenField("session_id", None),
]), no_return=True), no_submit=True, page_title="MIA PIA", page_style={
})
self.fields.add_link("SIMPLE",
"/simple",
active=model_name == "render")
self.fields.add_link("MIA PIA",
"/nice",
active=model_name != "render")
self.model_name = model_name
def get_content(self, **kwargs):
fields = self.fields.copy()
fields.child_loc["Chat"].set_default_options(["Hello! What is your name?", "Say one word and stop."])
"""
fields.child_loc["Chat"].set_avatars({
"self": get_asset("avatar.svg"),
"ChatGPT": get_asset("dog.svg"),
})
"""
session_id = str(uuid.uuid4())
return self.fill(fields, {
"video": f"""
""",
"session_id": session_id,
})
def message_sent(self, **data):
sid = data['sid']
self.emit(self.fields.child_loc["Chat"].clear_input(),
to=sid)
self.emit(self.fields.child_loc["video"].update(f"""
"""))
data = self.parse(self.fields, data)
session_id = data['session_id']
messages_map = self.r.hgetall(f'user-session:{session_id}')
messages = [self.fields.child_loc["Chat"].format_message("self" if i % 2 == 0 else "ChatGPT",
messages_map[f"message_{i}"])
for i in range(len(messages_map))]
print("history: ", messages)
text = data['Chat']['text']
self.emit(self.fields.child_loc["Chat"].update(messages + [
self.fields.child_loc["Chat"].format_message("self", text),
self.fields.child_loc["Chat"].format_message("ChatGPT", "Generating text..."),
]), to=sid)
output = self.aihandler(text)
output_text = output['text']
output_emotion = output['emotion']
messages_map[f"message_{len(messages)}"] = text
messages_map[f"message_{len(messages) + 1}"] = output_text
self.r.hset(f'user-session:{session_id}', mapping=messages_map)
self.emit(self.fields.child_loc["Chat"].update(messages + [
self.fields.child_loc["Chat"].format_message("self", text),
self.fields.child_loc["Chat"].format_message("ChatGPT", "Generating audio..."),
]), to=sid)
self.tts.tts_to_file(text=output_text,
speaker_wav="/home/ubuntu/repo/of_couse_here.wav",
language="en",
emotion=output_emotion,
file_path=f"./audio/{session_id}.wav")
speech_array, sampling_rate = librosa.load(f"./audio/{session_id}.wav",
sr=16000)
output = self.infer(speech_array, sampling_rate)
np.save(os.path.join("./audio", "{}.npy".format(session_id)),
output)
self.emit(self.fields.child_loc["Chat"].update(messages + [
self.fields.child_loc["Chat"].format_message("self", text),
self.fields.child_loc["Chat"].format_message("ChatGPT", "Rendering..."),
]), to=sid)
n = output.shape[0]
for i, fname in enumerate(tqdm(render_video(f"{session_id}",
model_name=self.model_name),
total=n)):
print("Got frame: ", fname, file=sys.stderr)
self.emit(self.fields.child_loc["progress"].update(100 * i // n),
to=sid)
construct_video(session_id)
self.emit(self.fields.child_loc["video"].update(f"""
<video controls="1" autoplay="1" name="media" style="border-radius: 12px; height: 80%">
<source src="/api/video/{session_id}" type="video/mp4">
</video>
"""), to=sid)
'''self.emit(self.fields.child_loc["video"].update(f"""
<img name="media" style="border-radius: 12px; height: 80%" src="/api/video/stream/{session_id}"></img>
"""))'''
self.emit(self.fields.child_loc["Chat"].update(messages + [
self.fields.child_loc["Chat"].format_message("self", text),
self.fields.child_loc["Chat"].format_message("ChatGPT", output_text),
]), to=sid)
page = MainPage("render")
web = Web({
"": "simple",
"simple": page,
"nice": page,
}, use_socketio_support=True)
host = '0.0.0.0'
port = 8011
debug = False
app = web.get_app()
@app.route("/api/video/<session_id>", methods=["GET"])
def get_video(session_id):
return send_file("./audio/{}.mp4".format(session_id))
def gen(session_id):
for image_path in render_video(f"{session_id}"):
with open(image_path, 'rb') as f:
yield (b'--frame\r\n'
b'Content-Type: image/jpeg\r\n\r\n' + f.read() + b'\r\n')
construct_video(session_id)
@app.route("/api/video/stream/<session_id>", methods=["GET"])
def get_video_async(session_id):
return Response(gen(session_id),
mimetype='multipart/x-mixed-replace; boundary=frame')
speaker_path = "/home/ubuntu/repo/female.wav"
@app.route("/api/set_speaker", methods=["POST"])
@describe(["3dmodel"],
name="Set emotion for 3D model",
description="""Set speaker for 3D model""",
inputs={
"user_id": "This ID from article Unique Identifier for iPHONE",
"speaker": "voice1 or voice2"
},
outputs={
"status": "ok"
})
@check_missing_keys([
("user_id", {"status": "error", "status_code": "missing_user_id_error"}),
("speaker", {"status": "error", "status_code": "missing_emotion_error"}),
])
def set_speaker():
speaker = request.json.get("speaker")
user_id = request.json.get("user_id")
SPEAKER[user_id] = speaker
return jsonify({
'status': 'ok'
})
@app.route("/api/set_emotion", methods=["POST"])
@describe(["3dmodel"],
name="Set emotion for 3D model",
description="""Set emotion for 3D model""",
inputs={
"user_id": "This ID from article Unique Identifier for iPHONE",
"emotion": "sad"
},
outputs={
"status": "ok"
})
@check_missing_keys([
("user_id", {"status": "error", "status_code": "missing_user_id_error"}),
("emotion", {"status": "error", "status_code": "missing_emotion_error"}),
])
def set_emotion():
emotion = request.json.get("emotion")
user_id = request.json.get("user_id")
EMOTIONS[user_id] = emotion
return jsonify({
'status': 'ok'
})
@app.route("/api/get_texts", methods=["POST"])
@describe(["text"],
name="Get texts for user_id",
description="""This endpoint get all texts for current iPhone""",
inputs={
"user_id": "This ID from article Unique Identifier for iPHONE"
},
outputs={
"text": "Output",
"id": "bot or user",
})
@check_missing_keys([
("user_id", {"status": "error", "status_code": "missing_user_id_error"}),
])
def get_texts():
user_id = request.json.get("user_id")
return jsonify(TEXTS[user_id])
@app.route("/api/send_text", methods=["POST"])
@describe(["text"],
name="Sent text to miapia",
description="""This endpoint sends texts for client""",
inputs={
"text": "Hello, MIAPIA",
"user_id": "This ID from article Unique Identifier for iPHONE"
},
outputs={
"status": "ok"
})
@check_missing_keys([
("text", {"status": "error", "status_code": "missing_text_error"}),
("user_id", {"status": "error", "status_code": "missing_user_id_error"}),
])
def send_text():
user_id = request.json.get("user_id")
text = request.json.get("text", "")
TEXTS[user_id].append({
"id": 'user',
"text": text
})
output_texts = page.aihandler_stream(text)
bot_text = ""
for output_text in output_texts:
bot_text += " " + output_text
TEXTS[user_id].append({
"id": 'bot',
"text": bot_text
})
return jsonify({
"status": "ok",
"messages": TEXTS[user_id]
})
io = web.get_socketio(app,
engineio_logger=False)
head_memories = {}
TEXTS = defaultdict(list)
EMOTIONS = {}
SPEAKER = {}
def get_event(name, value, timestamp):
return {
"index": model_bsList.index(name),
"value": value,
"timestamp": timestamp
}
def get_value(events, name):
index = model_bsList.index(name)
events = [event for event in events
if event['index'] == index]
if len(events) == 0:
return None
return events[-1]['value']
def get_head_memory():
ids = [100, 101, 103, 104, 106, 107, 109, 110]
return [[0, 0, 1] for _ in range(len(ids))]
def get_head_rotations(alpha, duration, memory, sign):
ids = [100, 101, 103, 104, 106, 107, 109, 110]
for _ in range(3):
index = ids.index(random.choice(ids))
step = 0.01 * sign[index]
memory[index][0] += step
memory[index][0] = min(memory[index][0], memory[index][2])
memory[index][0] = max(memory[index][0], memory[index][1])
print(memory)
return [{
"index": j,
"value": memory[i][0],
"timestamp": float(duration * alpha)
} for i, j in enumerate(ids)], memory
def perform_on_text(output_text, sid, head_memory, sign, voice):
session_id = str(uuid.uuid4())
page.tts.tts_to_file(text=output_text,
speaker_wav="/home/ubuntu/repo/female.wav" if voice == "voice1" else "/home/ubuntu/repo/indian.wav",
language="en",
emotion="Happy",
file_path=f"./audio/{session_id}.wav")
audio_path = f"./audio/{session_id}.wav"
with open(audio_path, 'rb') as f:
audio_content = f.read()
encode_string = base64.b64encode(audio_content).decode('utf-8')
speech_array, sampling_rate = librosa.load(audio_path,
sr=16000)
duration = librosa.get_duration(y=speech_array,
sr=sampling_rate)
output = page.infer(speech_array, sampling_rate)
emit("io_push_audio_blob", {
"dataURL": f"base64,{encode_string}"
}, to=sid)
print("Sent audio.")
emit("io_set_size", {
"size": output.shape[0],
}, to=sid)
t1 = time()
for i in tqdm(range(output.shape[0])):
rots, head_memory = get_head_rotations((i / output.shape[0]), duration, head_memory, sign)
blendshapes_i = [{
"index": j,
"value": output[i, j],
"timestamp": float(duration * (i / output.shape[0]))
} for j in range(output.shape[1])] + rots
if max([get_value(blendshapes_i, target_name)
for target_name in target_names]) > 0.5:
os.makedirs(blendshapes_path,
exist_ok=True)
save_blendshapes_i = os.path.join(blendshapes_path,
str(uuid.uuid4()) + '.json')
with open(save_blendshapes_i, 'w') as f:
json.dump(blendshapes_i, f)
emit("io_set_coef", blendshapes_i, to=sid)
# sleep(0.1 * duration / output.shape[0])
t2 = time()
sleep(max(0., duration - (t2 - t1)))
return head_memory
def perform_surgery(sid, duration=5):
with open("../5-seconds-of-silence.wav", 'rb') as f:
audio_content = f.read()
encode_string = base64.b64encode(audio_content).decode('utf-8')
fps = 20
emit("io_push_audio_blob", {
"dataURL": f"base64,{encode_string}"
}, to=sid)
print("Sent audio.")
emit("io_set_size", {
"size": (fps * duration)
}, to=sid)
t1 = time()
for i in tqdm(range(fps * duration)):
alpha = float(i / (fps * duration))
emit("io_set_coef", [
get_event("eyeWideLeft",
0.3 - 0.3 * alpha,
float(duration * alpha)),
get_event("eyeWideRight",
0.3 - 0.3 * alpha,
float(duration * alpha))
], to=sid)
t2 = time()
sleep(max(0., duration - (t2 - t1)))
@io.on("io_set_text")
def io_set_text(data):
data = json.loads(data)
data = data[0]
sid = None
print(data, file=sys.stderr)
if "text" not in data:
emit("io_error", {"message": "Text not found"},
to=sid)
return
text = data["text"]
"""if "user_id" not in data:
emit("io_error", {"message": "User not found"},
to=sid)
return"""
user_id = data.get('user_id')
print(user_id)
TEXTS[user_id].append({
"id": "user",
"text": text
})
voice = SPEAKER.get(user_id, "voice1")
if sid not in head_memories:
head_memories[sid] = get_head_memory()
head_memory = head_memories[sid]
# output_texts = [page.aihandler(text)['text']]
output_texts = page.aihandler_stream(text)
bot_text = ""
for output_text in output_texts:
sign = [2 * (random.random() > 0.5) - 1
for _ in range(8)]
head_memory = perform_on_text(output_text, sid, head_memory,
sign=sign,
voice=voice)
bot_text += " " + output_text
print("SURGERY STARTED!")
# perform_surgery(sid)
print("SURGERY ENDED!")
TEXTS[user_id].append({
"id": "bot",
"text": bot_text
})
emit("io_finish", {}, to=sid)
io.run(app,
host=host, port=port, debug=debug,
allow_unsafe_werkzeug=True)
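For reference, a hedged client-side sketch of the Socket.IO protocol implemented above (assumes the python-socketio package and that the server runs with default Socket.IO settings on localhost:8011): the client emits io_set_text with a JSON-encoded list containing one message object, then receives io_push_audio_blob, io_set_size, a stream of io_set_coef blendshape events, and finally io_finish.

import json
import socketio

sio = socketio.Client()

@sio.on("io_set_coef")
def on_coef(events):
    # Each event: {"index": blendshape index in model_bsList, "value": weight, "timestamp": seconds}.
    pass

@sio.on("io_push_audio_blob")
def on_audio(payload):
    pass                                   # payload["dataURL"] carries the base64-encoded WAV audio

@sio.on("io_finish")
def on_finish(_):
    sio.disconnect()

sio.connect("http://localhost:8011")
sio.emit("io_set_text", json.dumps([{"text": "Hello PIA", "user_id": "demo-user"}]))
sio.wait()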

154
miapia_own/pieinfer.py Normal file
View File

@@ -0,0 +1,154 @@
import librosa
import numpy as np
import argparse
from torch import cuda
from parse import parse
from scipy.signal import savgol_filter
import torch
from model import EmoTalk
import random
import os, subprocess
import shlex
from munch import Munch
@torch.no_grad()
def test(model, speech_array, sampling_rate):
args = Munch(
bs_dim=52,
feature_dim=832,
period=30,
device="cuda",
model_path="./pretrain_model/EmoTalk.pth",
max_seq_len=5000,
num_workers=0,
batch_size=1,
post_processing=True,
blender_path="./blender/blender")
eye1 = np.array([0.36537236, 0.950235724, 0.95593375, 0.916715622, 0.367256105, 0.119113259, 0.025357503])
eye2 = np.array([0.234776169, 0.909951985, 0.944758058, 0.777862132, 0.191071674, 0.235437036, 0.089163929])
eye3 = np.array([0.870040774, 0.949833691, 0.949418545, 0.695911646, 0.191071674, 0.072576277, 0.007108896])
eye4 = np.array([0.000307991, 0.556701422, 0.952656746, 0.942345619, 0.425857186, 0.148335218, 0.017659493])
# speech_array, sampling_rate = librosa.load(os.path.join(wav_path), sr=16000)
audio = torch.FloatTensor(speech_array).unsqueeze(0).to(args.device)
level = torch.tensor([1]).to(args.device)
person = torch.tensor([0]).to(args.device)
prediction = model.predict(audio, level, person)
prediction = prediction.squeeze().detach().cpu().numpy()
if args.post_processing:
output = np.zeros((prediction.shape[0], prediction.shape[1]))
for i in range(prediction.shape[1]):
output[:, i] = savgol_filter(prediction[:, i], 5, 2)
output[:, 8] = 0
output[:, 9] = 0
i = random.randint(0, 60)
while i < output.shape[0] - 7:
eye_num = random.randint(1, 4)
if eye_num == 1:
output[i:i + 7, 8] = eye1
output[i:i + 7, 9] = eye1
elif eye_num == 2:
output[i:i + 7, 8] = eye2
output[i:i + 7, 9] = eye2
elif eye_num == 3:
output[i:i + 7, 8] = eye3
output[i:i + 7, 9] = eye3
else:
output[i:i + 7, 8] = eye4
output[i:i + 7, 9] = eye4
time1 = random.randint(60, 180)
i = i + time1
return output
else:
return prediction
def render_video(wav_name, model_name):
args = Munch(
bs_dim=52,
feature_dim=832,
period=30,
device="cuda",
model_path="./pretrain_model/EmoTalk.pth",
max_seq_len=5000,
num_workers=0,
batch_size=1,
post_processing=True,
blender_path="./blender/blender")
# wav_name = args.wav_path.split('/')[-1].split('.')[0]
image_path = os.path.join("./audio", wav_name)
os.makedirs(image_path, exist_ok=True)
blender_path = args.blender_path
python_path = f"./{model_name}.py"
blend_path = f"./{model_name}.blend"
print(python_path, blend_path)
# python_path = "./render.py"
# blend_path = "./render.blend"
cmd = '{} -t 64 -b {} -P {} -- "{}" "{}" '.format(blender_path,
blend_path,
python_path,
"./audio/",
wav_name)
cmd = shlex.split(cmd)
p = subprocess.Popen(cmd,
shell=False,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
while p.poll() is None:
line = p.stdout.readline().decode('utf-8')
line = line.strip()
if line and line.startswith('Saved: '):
fname = parse("Saved: '{}'", line).fixed[0]
yield fname
else:
print(line)
if p.returncode == 0:
print('Subprogram success')
else:
print('Subprogram failed')
def construct_video(wav_name):
image_path = os.path.join("./audio", wav_name)
os.makedirs(image_path, exist_ok=True)
image_temp = image_path + "/%d.png"
output_path = os.path.join("./audio", wav_name + ".mp4")
cmd = 'ffmpeg -r 30 -i "{}" -i "{}" -pix_fmt yuv420p -s 512x768 "{}" -y'.format(image_temp,
f"./audio/{wav_name}.wav",
output_path)
subprocess.call(cmd, shell=True)
cmd = 'rm -rf "{}"'.format(image_path)
subprocess.call(cmd, shell=True)
class PieInfer(object):
def __init__(self):
args = Munch(
bs_dim=52,
feature_dim=832,
period=30,
device="cuda" if cuda.is_available() else "cpu",
model_path="./pretrain_model/EmoTalk.pth",
max_seq_len=5000,
num_workers=0,
batch_size=1,
post_processing=True,
blender_path="./blender/blender")
#"""
model = EmoTalk(args)
model.load_state_dict(torch.load(args.model_path, map_location=torch.device(args.device)), strict=False)
model = model.to(args.device)
model.eval()
#"""
# model = None
self.model = model
def __call__(self,
speech_array,
sampling_rate):
return test(self.model, speech_array, sampling_rate)
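A compact, standalone sketch of the offline pipeline defined above, mirroring what miapia_own/main.py does per chat turn; "example" is a hypothetical clip name expected at ./audio/example.wav.

import librosa
import numpy as np

infer = PieInfer()
speech, sr = librosa.load("./audio/example.wav", sr=16000)
blendshapes = infer(speech, sr)                       # (num_frames, bs_dim=52) blendshape weights
np.save("./audio/example.npy", blendshapes)           # the render scripts read <root_dir><name>.npy
for frame_png in render_video("example", model_name="render"):
    print("rendered", frame_png)
construct_video("example")                            # muxes frames + audio into ./audio/example.mp4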

View File

@@ -0,0 +1,78 @@
import sys
import pandas as pd
import argparse
import base64
from flask import send_file, Response
from flask_socketio import emit
from piedemo.fields.ajax_group import AjaxChatField, AjaxGroup
from piedemo.fields.grid import VStack, HStack, SpaceField
from piedemo.fields.inputs.hidden import InputHiddenField
from piedemo.fields.outputs.colored_text import ptext, OutputColoredTextField
from piedemo.fields.outputs.json import OutputJSONField
from piedemo.fields.outputs.progress import ProgressField
from piedemo.fields.outputs.video import OutputVideoField
from piedemo.web import Web
from piedemo.page import Page
from piedemo.hub.svgpil import SVGImage
from piedemo.fields.outputs.table import OutputTableField
from piedemo.fields.inputs.int_list import InputIntListField
from piedemo.fields.navigation import Navigation
from piedemo.fields.inputs.chat import ChatField
import librosa
import uuid
import numpy as np
import redis
import argparse
from scipy.signal import savgol_filter
import torch
import random
import os, subprocess
import shlex
from tqdm import tqdm
class MainPage(Page):
def __init__(self, model_name: str):
super(MainPage, self).__init__()
web = Web({
"": "simple",
"simple": MainPage("render"),
# "nice": MainPage("FemAdv_b350_V2_050523"),
}, use_socketio_support=True)
host = '0.0.0.0'
port = 8011
debug = False
app = web.get_app()
io = web.get_socketio(app)
@io.on("io_set_text")
def io_set_text(data):
sid = None
if "text" not in data:
emit("io_error", {"message": "Text not found"},
to=sid)
encode_string = base64.b64encode(open("../feeling_good.wav", "rb").read())
for i in range(10):
j = random.randint(0, 2)
emit("io_set_coef", [{
"index": j,
"value": i / 10,
}], to=sid)
emit("io_push_audio_blob", {
"dataURL": f"base64,{encode_string}"
}, to=sid)
emit("io_finish", {}, to=sid)
io.run(app,
host=host, port=port, debug=debug,
allow_unsafe_werkzeug=True)


1
server/connect.sh Executable file
View File

@@ -0,0 +1 @@
ssh -i ~/.ssh/id_rsa_miapia ubuntu@54.172.214.227

1
server/sync_code.sh Executable file
View File

@@ -0,0 +1 @@
rsync --rsh='ssh -i ~/.ssh/id_rsa_miapia -o IdentitiesOnly=yes' --exclude='.git' --exclude 'node_modules' -Pav ../miapia_own/ ubuntu@54.172.214.227:/home/ubuntu/repo/EmoTalk_release

1
server/sync_code_mia.sh Executable file
View File

@@ -0,0 +1 @@
rsync --rsh='ssh -i ~/.ssh/id_rsa_miapia -o IdentitiesOnly=yes' --exclude='.git' --exclude 'node_modules' -Pav ../ ubuntu@54.172.214.227:/home/ubuntu/repo/

12
t2a_api.py Normal file
View File

@@ -0,0 +1,12 @@
from transformers import pipeline
import scipy.io.wavfile
class T2A(object):
def __init__(self):
self.synthesiser = pipeline("text-to-speech", "suno/bark")
def apply(self, text):
speech = self.synthesiser(text,
forward_params={"do_sample": True})
scipy.io.wavfile.write("bark_out.wav", rate=speech["sampling_rate"], data=speech["audio"])
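A usage sketch for the Bark wrapper above; the output path is the hardcoded bark_out.wav.

if __name__ == "__main__":
    t2a = T2A()
    t2a.apply("Hello from the MIA PIA text-to-audio pipeline.")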

38
test_a2f_api.py Normal file
View File

@@ -0,0 +1,38 @@
import math
import os
import requests
from pprint import pprint
import argparse
import soundfile
from a2f_api import A2F
parser = argparse.ArgumentParser()
parser.add_argument("audio_path")
parser.add_argument("--host", type=str,
default="https://a2fdemo.piedata.ai/")
args = parser.parse_args()
a2f = A2F(args.host)
print(f"Uploading {args.audio_path}...")
server_audio_path = a2f.upload(args.audio_path)
fname = os.path.basename(server_audio_path)
print("Status: ", a2f.status())
print("EmotionNames: ", a2f.get_emotion_names())
print("Scene Objects: ", a2f.get_scene_objects())
print("Scene Players: ", a2f.get_players())
print("Preprocessing settings: ", a2f.get_pre_settings())
print("Postprocessing settings: ", a2f.get_post_settings())
print("Setting player root: ", a2f.set_player_root("/home/ubuntu/results"))
print("Player root: ", a2f.get_player_root())
print("Setting audio: ", a2f.set_audio(os.path.basename(server_audio_path)))
print("Audio Range: ", a2f.get_audio_range())
print("Running: ", a2f.run())
print("NumKeys: ", a2f.get_number_of_keys())
print("Keys: ", a2f.get_generated_keys())
# print("BlendShape solvers: ", a2f.get_blendshape_solvers())
print("Exporting: ", a2f.export_json("/home/ubuntu/results",
filename=os.path.splitext(fname)[0]))
print(f"Pulling to ./{fname}...")
a2f.pull(fname)