Francis0917's picture
Upload folder using huggingface_hub
2045faa verified
import math
import tensorflow as tf
from tensorflow.keras import layers
def make_adjacency_matrix(speech_mask, text_mask):
    """Build a batched lower-triangular adjacency matrix over all valid nodes.

    Each example has one node per valid speech position plus one node per
    valid text position. Node ``i`` is connected to node ``j`` when both
    indices are below that example's node count and ``j <= i``.

    args
        speech_mask : [B, Ls]
        text_mask : [B, Lt]
        Both masks are assumed to be binary (bool or 0/1), with the truthy
        entries marking valid positions.

    returns
        adjacency_matrix : [B, N, N] float32, where N is the largest node
        count in the batch. Entries are 1. for kept edges and 0. elsewhere
        (including rows/columns beyond an example's own node count).
    """
    # [B, L] -> [B]
    # Counts are kept as int32 because tf.sequence_mask documents integer
    # `lengths` / `maxlen`.
    n_speech = tf.math.reduce_sum(tf.cast(speech_mask, tf.int32), -1)
    n_text = tf.math.reduce_sum(tf.cast(text_mask, tf.int32), -1)
    n_node = n_speech + n_text
    max_len = tf.math.reduce_max(n_node)

    # [B] -> [B, max_len] -> [B, max_len, 1] * [B, 1, max_len] -> [B, max_len, max_len]
    mask = tf.sequence_mask(n_node, maxlen=max_len, dtype=tf.float32)
    mask = tf.expand_dims(mask, -1) * tf.expand_dims(mask, 1)

    # band_part(x, -1, 0) keeps every sub-diagonal and no super-diagonal,
    # i.e. the lower triangle including the main diagonal.
    adjacency_matrix = tf.linalg.band_part(mask, -1, 0)
    return adjacency_matrix
def make_feature_matrix(speech_features, speech_mask, text_features, text_mask):
    """Pack the valid speech and text features of each example into one matrix.

    args
        speech_features : [B, Ls, F]
        speech_mask : [B, Ls]
        text_features : [B, Lt, F]
        text_mask : [B, Lt]

    returns
        feature_matrix : [B, N, F]. For every example the rows selected by
        speech_mask come first, followed by the rows selected by text_mask;
        examples with fewer than N selected rows are padded with 0.
    """
    # Data pre-processing: zero out the padded positions.
    # The mask is expanded to [B, L, 1] and broadcast over the feature axis,
    # so the feature dimension does not have to be known statically and no
    # tiled copy of the mask is materialised. Casting to the features' dtype
    # keeps the multiply valid for non-float32 features as well.
    speech_scale = tf.expand_dims(tf.cast(speech_mask, speech_features.dtype), -1)
    text_scale = tf.expand_dims(tf.cast(text_mask, text_features.dtype), -1)
    # Plain rebinding (not `*=`) so a caller-owned array is never modified.
    speech_features = speech_features * speech_scale
    text_features = text_features * text_scale

    # Concatenate two feature matrix along time axis
    feature_matrix = tf.concat([speech_features, text_features], axis=1)
    feature_mask = tf.concat(
        [tf.cast(speech_mask, tf.bool), tf.cast(text_mask, tf.bool)], axis=-1
    )

    # Gather valid data using mask : tensor -> ragged tensor -> tensor
    return tf.ragged.boolean_mask(feature_matrix, feature_mask).to_tensor(0.)