Getting timestamps for words spoken in ASR

Is there an easy way to do this? To timestamp the start and end times of words in ASR.

Hi, @seanle24245
There is no easy way to do this yet. We should fix it soon. At the moment you can do it only by modifying the `__call__` method of the `NeMoASR` class in `deeppavlov/models/nemo/`:

    def __call__(self, audio_batch: List[Union[str, BytesIO]]) -> Tuple[list, list]:
        """Transcripts audio batch to text and per-symbol audio-sample indexes.

        Args:
            audio_batch: Batch to be transcribed. Elements could be either paths to audio files or Binary I/O objects.

        Returns:
            text_batch: Batch of transcripts.
            indexes_batch: For each transcript, the (approximate) audio-sample index at which each emitted
                symbol starts; useful for deriving word timestamps.
        """
        data_layer = AudioInferDataLayer(audio_batch=audio_batch, **self.nemo_params['AudioToTextDataLayer'])
        audio_signal, audio_signal_len = data_layer()
        # NOTE(review): kwarg name `length` assumed from the NeMo preprocessor API — confirm against
        # the AudioToMelSpectrogramPreprocessor signature in the installed NeMo version.
        processed_signal, processed_signal_len = self.data_preprocessor(input_signal=audio_signal,
                                                                        length=audio_signal_len)
        encoded, encoded_len = self.jasper_encoder(audio_signal=processed_signal, length=processed_signal_len)
        log_probs = self.jasper_decoder(encoder_output=encoded)
        predictions = self.greedy_decoder(log_probs=log_probs)
        # Keep the raw signal lengths so symbol positions can later be mapped back to audio samples.
        eval_tensors = [predictions, audio_signal_len]
        tensors = self.neural_factory.infer(tensors=eval_tensors)

        def __ctc_decoder_predictions_tensor(tensor, signal_lengths, labels):
            """Decodes a sequence of label ids to text, tracking the sample index of each symbol."""
            blank_id = len(labels)
            hypotheses = []
            symbol_indexes = []
            labels_map = dict([(i, labels[i]) for i in range(len(labels))])
            prediction_cpu_tensor = tensor.long().cpu()
            # iterate over batch
            for ind in range(prediction_cpu_tensor.shape[0]):
                prediction = prediction_cpu_tensor[ind].numpy().tolist()
                prediction_len = len(prediction)
                samples_num = signal_lengths[ind].numpy()
                # CTC decoding procedure: collapse repeated symbols, drop blanks.
                decoded_prediction = []
                hyp_sample_num = []
                previous = len(labels)  # id of a blank symbol
                for i, p in enumerate(prediction):
                    if (p != previous or previous == blank_id) and p != blank_id:
                        decoded_prediction.append(p)
                        # Linearly map the frame index to an audio-sample index.
                        hyp_sample_num.append(int(i * samples_num / prediction_len))
                    previous = p
                hypothesis = ''.join([labels_map[c] for c in decoded_prediction])
                hypotheses.append(hypothesis)
                symbol_indexes.append(hyp_sample_num)
            return hypotheses, symbol_indexes

        def post_process_predictions(predictions_list: list, audio_signal_lengths: list, labels: list):
            """Decodes every prediction tensor in the batch and concatenates the results."""
            transcriptions, symbol_indexes = [], []
            for prediction, signal_lengths in zip(predictions_list, audio_signal_lengths):
                hyp, sn = __ctc_decoder_predictions_tensor(prediction, signal_lengths, labels=labels)
                transcriptions += hyp
                symbol_indexes += sn
            return transcriptions, symbol_indexes

        text_batch, indexes_batch = post_process_predictions(tensors[0], tensors[1], self.labels)
        return text_batch, indexes_batch

and modifying the config file:

  "chainer": {
    "in": "speech",
    "pipe": [
        "class_name": "nemo_asr",
        "nemo_params_path": "{NEMO_PATH}/quartznet15x5/quartznet15x5.yaml",
        "load_path": "{NEMO_PATH}/quartznet15x5",
        "in": ["speech"],
        "out": ["text", "indexes"]
    "out": ["text", "indexes"]
  "metadata": {
    "variables": {
      "NEMO_PATH": "~/.deeppavlov/models/nemo"
    "requirements": [
    "download": [
        "url": "",
        "subdir": "{NEMO_PATH}"