Is there an easy way to get the start and end timestamps of the words recognized by ASR?
Hi, @seanle24245
There is no easy way to do this yet; we should fix that soon. For now, the only way is to modify the `__call__` method of the `NeMoASR` class in `deeppavlov/models/nemo/asr.py`:
def __call__(self, audio_batch: List[Union[str, BytesIO]]) -> Tuple[list, list]:
    """Transcribe an audio batch and report where each decoded symbol occurs.

    Args:
        audio_batch: Batch to be transcribed. Elements could be either paths to audio files or Binary I/O objects.

    Returns:
        text_batch: Batch of transcripts.
        indexes_batch: For every transcript, a list with one integer per decoded symbol. Each integer is
            the index of the emitting decoder frame rescaled linearly to the length of the input signal.
            NOTE(review): the signal length is presumably measured in audio samples, in which case
            dividing an index by the sample rate gives the symbol time in seconds - confirm against
            the data layer config.
    """
    data_layer = AudioInferDataLayer(audio_batch=audio_batch, **self.nemo_params['AudioToTextDataLayer'])
    audio_signal, audio_signal_len = data_layer()
    processed_signal, processed_signal_len = self.data_preprocessor(input_signal=audio_signal,
                                                                    length=audio_signal_len)
    encoded, encoded_len = self.jasper_encoder(audio_signal=processed_signal, length=processed_signal_len)
    log_probs = self.jasper_decoder(encoder_output=encoded)
    predictions = self.greedy_decoder(log_probs=log_probs)
    # The signal lengths are evaluated together with the predictions because they are
    # needed afterwards to rescale frame indexes to positions in the signal.
    eval_tensors = [predictions, audio_signal_len]
    tensors = self.neural_factory.infer(tensors=eval_tensors)

    def ctc_decode_with_positions(tensor, signal_lengths, labels):
        """Greedy CTC decoding of one batch tensor, keeping the position of every kept symbol.

        Returns a pair ``(hypotheses, symbol_indexes)``: decoded strings and, aligned with them,
        lists of rescaled positions (one per decoded symbol).
        """
        blank_id = len(labels)  # the blank symbol id follows the last real label
        labels_map = dict(enumerate(labels))
        hypotheses = []
        symbol_indexes = []
        prediction_cpu_tensor = tensor.long().cpu()
        # iterate over batch
        for ind in range(prediction_cpu_tensor.shape[0]):
            prediction = prediction_cpu_tensor[ind].numpy().tolist()
            prediction_len = len(prediction)
            samples_num = int(signal_lengths[ind])
            # CTC decoding procedure: drop blanks and collapse runs of a repeated symbol,
            # remembering the first frame of each kept symbol.
            decoded_prediction = []
            hyp_sample_num = []
            previous = blank_id
            for i, p in enumerate(prediction):
                if p != previous and p != blank_id:
                    decoded_prediction.append(p)
                    # NOTE(review): this assumes the prediction_len frames span exactly this
                    # item's samples_num; if the batch is padded to a common length the
                    # positions of shorter items may be compressed - confirm.
                    hyp_sample_num.append(int(samples_num * i / prediction_len))
                previous = p
            hypotheses.append(''.join(labels_map[c] for c in decoded_prediction))
            symbol_indexes.append(hyp_sample_num)
        return hypotheses, symbol_indexes

    def post_process_predictions(predictions_list: list, audio_signal_lengths: list, labels: list):
        """Decode every evaluated batch and flatten the results into two aligned lists."""
        transcriptions, symbol_indexes = [], []
        for prediction, signal_lengths in zip(predictions_list, audio_signal_lengths):
            hyp, positions = ctc_decode_with_positions(prediction, signal_lengths, labels=labels)
            transcriptions += hyp
            symbol_indexes += positions
        return transcriptions, symbol_indexes

    text_batch, indexes_batch = post_process_predictions(tensors[0], tensors[1], self.labels)
    return text_batch, indexes_batch
and by modifying the config file so that the pipeline also outputs the symbol indexes:
{
"chainer": {
"in": "speech",
"pipe": [
{
"class_name": "nemo_asr",
"nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml",
"load_path": "{NEMO_PATH}/quartznet15x5",
"in": ["speech"],
"out": ["text", "indexes"]
}
],
"out": ["text", "indexes"]
},
"metadata": {
"variables": {
"NEMO_PATH": "~/.deeppavlov/models/nemo"
},
"requirements": [
"{DEEPPAVLOV_PATH}/requirements/pytorch.txt",
"{DEEPPAVLOV_PATH}/requirements/nemo-asr.txt"
],
"download": [
{
"url": "http://files.deeppavlov.ai/deeppavlov_data/nemo/quartznet15x5.tar.gz",
"subdir": "{NEMO_PATH}"
}
]
}
}
After getting the symbol indexes, how do I convert them to symbol timestamps?