/*! @license
* Shaka Player
* Copyright 2016 Google LLC
* SPDX-License-Identifier: Apache-2.0
*/
goog.provide('shaka.text.SpeechToText');
goog.require('goog.asserts');
goog.require('shaka.log');
goog.require('shaka.util.ArrayUtils');
goog.require('shaka.util.Dom');
goog.require('shaka.util.EventManager');
goog.require('shaka.util.FakeEvent');
goog.require('shaka.util.IReleasable');
goog.require('shaka.util.ManifestParserUtils');
goog.require('shaka.util.Lazy');
goog.require('shaka.util.Timer');
goog.requireType('shaka.Player');
/**
* @implements {shaka.util.IReleasable}
*/
/**
 * Generates live captions for the current media element's audio using the
 * Web Speech API (SpeechRecognition with a MediaStreamTrack input, a Chrome
 * extension of the API), optionally translating the transcript with the
 * browser Translator API, and renders the result into an overlay container
 * inside the player's video container.
 *
 * @implements {shaka.util.IReleasable}
 */
shaka.text.SpeechToText = class {
  /**
   * @param {shaka.Player} player
   */
  constructor(player) {
    /** @private {?shaka.Player} */
    this.player_ = player;

    /** @private {?shaka.extern.SpeechToTextConfiguration} */
    this.config_ = null;

    /**
     * Listeners tied to the overall enabled lifetime (player/media element).
     * @private {!shaka.util.EventManager}
     */
    this.eventManager_ = new shaka.util.EventManager();

    /** @private {boolean} */
    this.supported_ =
        shaka.text.SpeechToText.isMediaStreamTrackSupported.value();

    /** @type {HTMLElement} */
    this.textContainer_ = this.getTextContainer_();

    /** @private {boolean} */
    this.enabled_ = false;

    /** @private {?ChromeSpeechRecognition} */
    this.recognition_ = null;

    /** @private {?Translator} */
    this.translator_ = null;

    /** @private {?AbortController} */
    this.translatorAbortController_ = null;

    /** @private {boolean} */
    this.needTranslator_ = false;

    /**
     * Listeners tied to the lifetime of one SpeechRecognition instance.
     * @private {!shaka.util.EventManager}
     */
    this.recognitionEventManager_ = new shaka.util.EventManager();

    /**
     * Watchdog timer: if recognition stops producing results, restart it.
     * @private {shaka.util.Timer}
     */
    this.recognitionTimer_ = new shaka.util.Timer(() => {
      this.stopRecognition_();
      this.onAudioTrackChange_();
    });

    /**
     * Large starting id to avoid colliding with ids of real text tracks.
     * @private {number}
     */
    this.nextTextTrackId_ = 1e15;

    /** @private {shaka.extern.TextTrack} */
    this.basicTextTrack_ = this.createTextTrack_();

    /** @private {!Array<shaka.extern.TextTrack>} */
    this.textTracks_ = [
      this.basicTextTrack_,
    ];

    /** @private {?number} */
    this.activeTrackId_ = null;
  }

  /**
   * Applies a new configuration and refreshes the synthetic text track list.
   *
   * @param {shaka.extern.SpeechToTextConfiguration} config
   */
  configure(config) {
    this.config_ = config;
    this.checkTextTrackChanges_();
  }

  /**
   * @override
   */
  release() {
    this.activeTrackId_ = null;
    this.enabled_ = false;
    this.eventManager_.removeAll();
    this.stopRecognition_();
    this.player_ = null;
    this.eventManager_.release();
    // Fix: the recognition-scoped event manager was previously leaked here.
    this.recognitionEventManager_.release();
    // Remove the text container element from the UI.  Element.remove() is a
    // no-op when detached, so no parentElement check is needed, and the
    // reference is always cleared.
    if (this.textContainer_) {
      this.textContainer_.remove();
      this.textContainer_ = null;
    }
  }

  /**
   * Enable speech to text.
   *
   * @param {!shaka.extern.TextTrack} track
   */
  enable(track) {
    if (!this.supported_) {
      return;
    }
    if (!this.textContainer_) {
      // The video container may not have existed at construction time.
      this.textContainer_ = this.getTextContainer_();
    }
    if (!this.textContainer_) {
      // Still no place to render captions.
      return;
    }
    if (this.enabled_ && track.id == this.activeTrackId_) {
      return;
    }
    const wasEnabled = this.enabled_;
    this.enabled_ = true;
    this.activeTrackId_ = track.id;
    const mediaElement = this.player_.getMediaElement();
    if (!wasEnabled) {
      // Fix: register these listeners only on the first enable().  The old
      // code re-registered them on every track switch while enabled, which
      // stacked duplicate listeners on eventManager_.
      this.eventManager_.listen(this.player_, 'audiotrackschanged', () => {
        this.onAudioTrackChange_();
      });
      this.eventManager_.listen(mediaElement, 'seeking', () => {
        this.stopRecognition_();
        this.onAudioTrackChange_();
      });
      this.eventManager_.listen(mediaElement, 'pause', () => {
        // Keep the last rendered caption visible while paused.
        this.stopRecognition_(/* removeRendered= */ false);
      });
      this.eventManager_.listen(mediaElement, 'play', () => {
        this.onAudioTrackChange_();
      });
    }
    if (!mediaElement.paused) {
      this.onAudioTrackChange_();
    }
  }

  /**
   * Disable speech to text.
   */
  disable() {
    if (!this.enabled_) {
      return;
    }
    this.enabled_ = false;
    this.activeTrackId_ = null;
    this.eventManager_.removeAll();
    this.stopRecognition_();
  }

  /**
   * @return {boolean} true if speech to text is currently enabled.
   */
  isEnabled() {
    return this.enabled_;
  }

  /**
   * @return {boolean} true if the platform supports recognition from a
   *   MediaStreamTrack and a render container is (or can be) available.
   */
  isSupported() {
    if (!this.supported_) {
      return false;
    }
    if (!this.textContainer_) {
      this.textContainer_ = this.getTextContainer_();
    }
    if (!this.textContainer_) {
      return false;
    }
    return true;
  }

  /**
   * Returns the synthetic text tracks exposed for speech to text, with the
   * `active` flag refreshed.
   *
   * @return {!Array<shaka.extern.TextTrack>}
   */
  getTextTracks() {
    if (!this.isSupported()) {
      return [];
    }
    for (const textTrack of this.textTracks_) {
      textTrack.active = textTrack.id == this.activeTrackId_;
    }
    return this.textTracks_;
  }

  /**
   * (Re)starts recognition to match the active audio track's language and the
   * active text track's translation target.
   *
   * @private
   */
  onAudioTrackChange_() {
    this.removeRenderedText_();
    const audioTracks = this.player_.getAudioTracks();
    if (audioTracks.length) {
      const mediaStreamTrack = this.getAudioTrackFromMediaElement_();
      if (!mediaStreamTrack) {
        return;
      }
      const activeAudioTrack = audioTracks.find((t) => t.active);
      // Default to English when the track language is unknown ('und').
      let sourceLanguage = 'en';
      if (activeAudioTrack && activeAudioTrack.language &&
          activeAudioTrack.language != 'und') {
        sourceLanguage = activeAudioTrack.language;
      }
      const activeTextTrack =
          this.textTracks_.find((t) => t.id == this.activeTrackId_);
      let targetLanguage = '';
      if (activeTextTrack && activeTextTrack.language &&
          activeTextTrack.language != 'und') {
        targetLanguage = activeTextTrack.language;
      }
      if (this.recognition_ && this.recognition_.lang == sourceLanguage) {
        // Recognition language is unchanged; only the translator may need to
        // change.  Errors are surfaced inside setupTranslator_ already.
        this.setupTranslator_(sourceLanguage, targetLanguage).catch(() => {});
        return;
      }
      this.initRecognition_(mediaStreamTrack, sourceLanguage, targetLanguage);
    }
  }

  /**
   * Creates a new SpeechRecognition session on the given audio track and
   * wires up rendering (and optional translation) of its results.
   *
   * @param {!MediaStreamTrack} mediaStreamTrack
   * @param {string} sourceLanguage
   * @param {string} targetLanguage
   * @private
   */
  initRecognition_(mediaStreamTrack, sourceLanguage, targetLanguage) {
    goog.asserts.assert(this.config_, 'Config must not be null!');
    this.stopRecognition_();
    this.setupTranslator_(sourceLanguage, targetLanguage).catch(() => {});
    const SpeechRecognition =
        window.SpeechRecognition || window.webkitSpeechRecognition;
    this.recognition_ = /** @type {ChromeSpeechRecognition} */(
      new SpeechRecognition());
    this.recognition_.lang = sourceLanguage;
    this.recognition_.continuous = true;
    this.recognition_.interimResults = true;
    this.recognition_.processLocally = this.config_.processLocally;
    this.recognitionEventManager_.listen(this.recognition_, 'start', () => {
      shaka.log.debug('Speech to text: start', sourceLanguage);
      // If no results arrive within 5s, the watchdog restarts recognition.
      this.recognitionTimer_.tickAfter(/* seconds= */ 5);
    });
    this.recognitionEventManager_.listen(this.recognition_, 'result',
        async (e) => {
          goog.asserts.assert(this.config_, 'Config must not be null!');
          const event = /** @type {SpeechRecognitionEvent} */(e);
          let text = '';
          for (let i = event.resultIndex; i < event.results.length; i++) {
            // The Web Speech API adds appropriate leading/trailing
            // whitespace.
            text += event.results[i][0].transcript;
          }
          if (this.needTranslator_) {
            if (this.translator_) {
              try {
                text = await this.translator_.translate(text);
              } catch (e) {
                // Skip rendering this result rather than showing the
                // untranslated text.
                return;
              }
            } else {
              // Translation required but the translator is not ready yet.
              return;
            }
          }
          if (this.textContainer_) {
            this.removeRenderedText_();
            const elem = shaka.util.Dom.createHTMLElement('span');
            // Prevent browser auto-translate features from re-translating.
            elem.setAttribute('translate', 'no');
            elem.style.backgroundColor = 'rgba(0, 0, 0, 0.8)';
            elem.style.padding = '0px 5px';
            elem.style.margin = '2.5% 5%';
            elem.textContent =
                this.truncateLastWords_(text, this.config_.maxTextLength);
            this.textContainer_.appendChild(elem);
          }
          // Shorter watchdog while results are flowing.
          this.recognitionTimer_.tickAfter(/* seconds= */ 0.75);
        });
    this.recognitionEventManager_.listen(this.recognition_, 'error', (e) => {
      this.removeRenderedText_();
      shaka.log.debug('Speech to text: error', e);
    });
    this.recognitionEventManager_.listen(this.recognition_, 'end', () => {
      shaka.log.debug('Speech to text: end', sourceLanguage);
      // Recognition sessions end on their own periodically; restart to keep
      // captions continuous.  stopRecognition_() removes this listener first,
      // so a deliberate stop does not loop.
      this.initRecognition_(mediaStreamTrack, sourceLanguage, targetLanguage);
    });
    this.recognition_.start(mediaStreamTrack);
  }

  /**
   * Tears down any existing translator and, when source and target differ and
   * the Translator API is available, creates a new one.
   *
   * @param {string} sourceLanguage
   * @param {string} targetLanguage
   * @return {!Promise}
   * @private
   */
  async setupTranslator_(sourceLanguage, targetLanguage) {
    if (this.translatorAbortController_) {
      this.translatorAbortController_.abort();
      this.translatorAbortController_ = null;
    }
    if (this.translator_) {
      this.translator_.destroy();
      this.translator_ = null;
    }
    if (targetLanguage && sourceLanguage != targetLanguage &&
        'Translator' in window) {
      this.needTranslator_ = true;
      this.translatorAbortController_ = new AbortController();
      const signal = this.translatorAbortController_.signal;
      try {
        this.translator_ = await Translator.create({
          sourceLanguage: sourceLanguage,
          targetLanguage: targetLanguage,
          signal: signal,
        });
      } catch (err) {
        // AbortError means we cancelled this setup ourselves; stay quiet.
        if (!err.name || err.name !== 'AbortError') {
          const languages = {
            sourceLanguage: sourceLanguage,
            targetLanguage: targetLanguage,
          };
          shaka.log.error('Error creating Translator', languages, err);
          if (err.name == 'NotSupportedError') {
            // This language pair can never work; stop captioning entirely.
            this.stopRecognition_();
          }
        }
      }
    } else {
      this.needTranslator_ = false;
    }
  }

  /**
   * Stops the current recognition session and translator, optionally leaving
   * the last rendered caption on screen (used on 'pause').
   *
   * @param {boolean=} removeRendered
   * @private
   */
  stopRecognition_(removeRendered = true) {
    // Remove the 'end' listener first so stop() does not trigger a restart.
    this.recognitionEventManager_.removeAll();
    this.recognitionTimer_.stop();
    this.needTranslator_ = false;
    if (this.translatorAbortController_) {
      this.translatorAbortController_.abort();
      this.translatorAbortController_ = null;
    }
    if (this.translator_) {
      this.translator_.destroy();
      this.translator_ = null;
    }
    if (this.recognition_) {
      this.recognition_.stop();
      this.recognition_ = null;
    }
    if (removeRendered) {
      this.removeRenderedText_();
    }
  }

  /**
   * Clears any caption currently rendered in the text container.
   *
   * @private
   */
  removeRenderedText_() {
    if (this.textContainer_) {
      shaka.util.Dom.removeAllChildren(this.textContainer_);
    }
  }

  /**
   * Truncates a string to the last `limit` characters, ensuring that only
   * complete words are included. If a word is cut at the limit, it is included
   * in full. Adds '...' at the start if truncation occurs.
   *
   * @param {string} text - The input string to truncate.
   * @param {number} limit - The maximum number of characters to consider from
   *                         the end of the string.
   * @return {string} The truncated string, starting at the first complete word
   *                  within the limit, and prefixed with '...' if truncation
   *                  was applied.
   * @private
   */
  truncateLastWords_(text, limit) {
    if (text.length <= limit) {
      return text;
    }
    // Start from the position where the last `limit` characters begin
    let start = text.length - limit;
    // Move backwards to the start of the word if we are in the middle of one
    while (start > 0 && text[start - 1] !== ' ') {
      start--;
    }
    // Take the substring from the found position to the end
    const result = text.slice(start).trimStart();
    // Add '...' at the start to indicate truncation
    return '...' + result;
  }

  /**
   * Exposes the media element's audio output as a MediaStreamTrack, building
   * (and caching per element) the Web Audio graph needed to tap it.
   *
   * @return {?MediaStreamTrack}
   * @private
   */
  getAudioTrackFromMediaElement_() {
    const mediaElement = this.player_.getMediaElement();
    if (!mediaElement) {
      return null;
    }
    if (!shaka.text.SpeechToText.audioObjectMap_.has(mediaElement)) {
      const AudioContext = window.AudioContext || window.webkitAudioContext;
      const audioContext = new AudioContext();
      // A media element can only have one source node; cache the whole graph
      // so repeated calls (and repeated enables) reuse it.
      const sourceNode = audioContext.createMediaElementSource(mediaElement);
      const destinationNode = audioContext.createMediaStreamDestination();
      sourceNode.connect(destinationNode);
      // Also connect back to the speakers, or the element goes silent.
      sourceNode.connect(audioContext.destination);
      const audioTrack = destinationNode.stream.getAudioTracks()[0];
      shaka.text.SpeechToText.audioObjectMap_.set(mediaElement, {
        audioContext,
        sourceNode,
        destinationNode,
        audioTrack,
      });
    }
    const audioObject =
        shaka.text.SpeechToText.audioObjectMap_.get(mediaElement);
    return audioObject.audioTrack;
  }

  /**
   * Creates the caption overlay element and attaches it to the player's
   * video container, or returns null if there is no container yet.
   *
   * @return {?HTMLElement}
   * @private
   */
  getTextContainer_() {
    const videoContainer = this.player_.getVideoContainer();
    if (!videoContainer) {
      return null;
    }
    /** @type {HTMLElement} */
    const textContainer = shaka.util.Dom.createHTMLElement('div');
    textContainer.classList.add('shaka-speech-to-text-container');
    // Set the subtitles text-centered by default.
    textContainer.style.textAlign = 'center';
    // Set the captions in the middle horizontally by default.
    textContainer.style.display = 'flex';
    textContainer.style.flexDirection = 'column';
    textContainer.style.alignItems = 'center';
    // Set the captions at the bottom by default.
    textContainer.style.justifyContent = 'flex-end';
    videoContainer.appendChild(textContainer);
    return textContainer;
  }

  /**
   * Rebuilds the synthetic text track list from
   * config.languagesToTranslate, disabling ourselves if the active track
   * was removed, and fires 'textchanged' when the list changed.
   *
   * @private
   */
  checkTextTrackChanges_() {
    goog.asserts.assert(this.config_, 'Config must not be null!');
    const existingTrackLanguages =
        this.textTracks_.map((t) => t.language).filter((t) => t);
    const languageChanges = !shaka.util.ArrayUtils.hasSameElements(
        this.config_.languagesToTranslate, existingTrackLanguages);
    if (languageChanges && 'Translator' in window) {
      this.textTracks_ = this.textTracks_.filter((t) => {
        if (t.id == this.basicTextTrack_.id) {
          // The untranslated track always survives.
          return true;
        }
        if (this.config_.languagesToTranslate.includes(t.language)) {
          return true;
        }
        if (t.id == this.activeTrackId_) {
          // The active track is being removed; shut down first.
          this.disable();
        }
        return false;
      });
      for (const language of this.config_.languagesToTranslate) {
        let track = this.textTracks_.find((t) => t.language == language);
        if (!track) {
          track = this.createTextTrack_();
          track.language = language;
          this.textTracks_.push(track);
        }
      }
      const event = new shaka.util.FakeEvent(
          shaka.util.FakeEvent.EventName.TextChanged);
      this.player_.dispatchEvent(event);
    }
  }

  /**
   * Builds a new synthetic text track descriptor with a unique id.
   *
   * @return {shaka.extern.TextTrack}
   * @private
   */
  createTextTrack_() {
    return {
      id: this.nextTextTrackId_++,
      active: false,
      type: shaka.util.ManifestParserUtils.ContentType.TEXT,
      bandwidth: 0,
      language: '',
      label: null,
      kind: null,
      mimeType: null,
      codecs: null,
      primary: false,
      roles: [],
      accessibilityPurpose: null,
      forced: false,
      originalTextId: null,
      // Marker so these synthetic tracks can be told apart from real ones.
      originalLanguage: 'speech-to-text',
    };
  }
};
/**
 * The Web Audio objects built per media element to expose its audio output
 * as a MediaStreamTrack for SpeechRecognition.
 *
 * @typedef {{
 *   audioContext: AudioContext,
 *   sourceNode: MediaElementAudioSourceNode,
 *   destinationNode: MediaStreamAudioDestinationNode,
 *   audioTrack: MediaStreamTrack,
 * }}
 */
shaka.text.SpeechToText.AudioObject;

/**
 * Cache of the audio graph created for each media element.
 *
 * For now, we never clean this up because if we close the context and
 * disconnect from the source, the audio from the video element never
 * works again.
 *
 * @const {!Map<!HTMLMediaElement, shaka.text.SpeechToText.AudioObject>}
 * @private
 */
shaka.text.SpeechToText.audioObjectMap_ = new Map();
/**
 * Lazily detects whether SpeechRecognition.start() accepts a
 * MediaStreamTrack argument (a non-standard extension); without it we
 * cannot caption the media element's audio.
 *
 * @const {!shaka.util.Lazy.<boolean>}
 */
shaka.text.SpeechToText.isMediaStreamTrackSupported =
    new shaka.util.Lazy(() => {
      // To avoid a permission prompt, we do this test in a temporary iframe.
      // Lazy() will make sure it only happens once, and only on demand.
      // The iframe must be attached briefly so contentWindow exists.
      /** @type {HTMLIFrameElement} */
      const frame = shaka.util.Dom.asHTMLIFrameElement(
          document.body.appendChild(document.createElement('iframe')));
      const contentWindow = frame.contentWindow;
      const SpeechRecognition = contentWindow.SpeechRecognition ||
          contentWindow.webkitSpeechRecognition;
      if (!SpeechRecognition) {
        frame.remove();
        return false;
      }
      // Run this with the iframe detached from the DOM.
      const recognition = /** @type {ChromeSpeechRecognition} */(
        new SpeechRecognition());
      frame.remove();
      try {
        // If the new parameter is not used, this start() call succeeds,
        // because the 0 gets ignored. If this were running in the main
        // window, we would get a microphone permission prompt, but the iframe
        // keeps this silent by denying permission immediately.
        recognition.start(/** @type {MediaStreamTrack} */(/** @type {?} */(0)));
        recognition.stop();
        return false;
      } catch (error) {
        // If the new parameter is checked, we get a TypeError because 0 isn't
        // a MediaStreamTrack.
        return error.name == 'TypeError';
      }
    });