Merge pull request #5888 from matrix-org/travis/voice/event_type

Expand upon voice message event & include overall waveform
2021-04-22 14:41:59 -06:00 · 2021-04-22 14:41:59 -06:00 · 06726d38fe
parent 21e7847686 14809dfda7
commit 06726d38fe
6 changed files with 214 additions and 20 deletions
--- a/src/@types/global.d.ts
+++ b/src/@types/global.d.ts
@ -129,4 +129,30 @@ declare global {
        // https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/Error/columnNumber
        columnNumber?: number;
    }
+
+    // https://github.com/microsoft/TypeScript/issues/28308#issuecomment-650802278
+    interface AudioWorkletProcessor { // instance side of a worklet processor, declared here for the compiler (see linked issue)
+        readonly port: MessagePort; // message channel between the processor and its node; RecorderWorklet posts its payloads through it
+        process( // per-block callback; RecorderWorklet notes that returning false signals the processor is "done"
+            inputs: Float32Array[][],
+            outputs: Float32Array[][],
+            parameters: Record<string, Float32Array>
+        ): boolean;
+    }
+
+    // https://github.com/microsoft/TypeScript/issues/28308#issuecomment-650802278
+    const AudioWorkletProcessor: { // constructor side of the same name, so that classes can `extends AudioWorkletProcessor`
+        prototype: AudioWorkletProcessor;
+        new (options?: AudioWorkletNodeOptions): AudioWorkletProcessor;
+    };
+
+    // https://github.com/microsoft/TypeScript/issues/28308#issuecomment-650802278
+    /**
+     * Ambient declaration of the worklet-scope function which registers a processor
+     * class under a name, so that an `AudioWorkletNode` can later be created by that name.
+     * @param name The name to register the processor under.
+     * @param processorCtor The processor class. It may optionally expose a static
+     * `parameterDescriptors` list.
+     */
+    function registerProcessor(
+        name: string,
+        processorCtor: (new (
+            options?: AudioWorkletNodeOptions
+        ) => AudioWorkletProcessor) & {
+            parameterDescriptors?: AudioParamDescriptor[];
+        }
+    ): void;
 }
--- a/src/components/views/rooms/VoiceRecordComposerTile.tsx
+++ b/src/components/views/rooms/VoiceRecordComposerTile.tsx
@ -53,9 +53,38 @@ export default class VoiceRecordComposerTile extends React.PureComponent<IProps,
            await this.state.recorder.stop();
            const mxc = await this.state.recorder.upload();
            MatrixClientPeg.get().sendMessage(this.props.room.roomId, {
-                body: "Voice message",
-                msgtype: "org.matrix.msc2516.voice",
-                url: mxc,
+                "body": "Voice message",
+                "msgtype": "org.matrix.msc2516.voice",
+                //"msgtype": MsgType.Audio,
+                "url": mxc,
+                "info": {
+                    duration: Math.round(this.state.recorder.durationSeconds * 1000),
+                    mimetype: this.state.recorder.contentType,
+                    size: this.state.recorder.contentLength,
+                },
+
+                // MSC1767 experiment
+                "org.matrix.msc1767.text": "Voice message",
+                "org.matrix.msc1767.file": {
+                    url: mxc,
+                    name: "Voice message.ogg",
+                    mimetype: this.state.recorder.contentType,
+                    size: this.state.recorder.contentLength,
+                },
+                "org.matrix.msc1767.audio": {
+                    duration: Math.round(this.state.recorder.durationSeconds * 1000),
+                    // TODO: @@ TravisR: Waveform? (MSC1767 decision)
+                },
+                "org.matrix.experimental.msc2516.voice": { // MSC2516+MSC1767 experiment
+                    duration: Math.round(this.state.recorder.durationSeconds * 1000),
+
+                    // Events can't have floats, so we try to maintain resolution by using 1024
+                    // as a maximum value. The waveform contains values between zero and 1, so this
+                    // should come out largely sane.
+                    //
+                    // We're expecting about one data point per second of audio.
+                    waveform: this.state.recorder.finalWaveform.map(v => Math.round(v * 1024)),
+                },
            });
            await VoiceRecordingStore.instance.disposeRecording();
            this.setState({recorder: null});
--- a/src/utils/arrays.ts
+++ b/src/utils/arrays.ts
@ -54,7 +54,7 @@ export function arraySeed<T>(val: T, length: number): T[] {
 * @param a The array to clone. Must be defined.
 * @returns A copy of the array.
 */
-export function arrayFastClone(a: any[]): any[] {
+export function arrayFastClone<T>(a: T[]): T[] {
    return a.slice(0, a.length);
 }

--- a/src/voice/RecorderWorklet.ts
+++ b/src/voice/RecorderWorklet.ts
@ -0,0 +1,67 @@
+/*
+Copyright 2021 The Matrix.org Foundation C.I.C.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+import {IAmplitudePayload, ITimingPayload, PayloadEvent, WORKLET_NAME} from "./consts";
+import {percentageOf} from "../utils/numbers";
+
+// from AudioWorkletGlobalScope: https://developer.mozilla.org/en-US/docs/Web/API/AudioWorkletGlobalScope
+declare const currentTime: number;
+// declare const currentFrame: number;
+// declare const sampleRate: number;
+
+/**
+ * Audio worklet processor used while recording a voice message. It only observes its
+ * input and reports back over its message port: a timekeeping payload on every
+ * `process()` call, and an amplitude payload at most once per rounded second of
+ * `currentTime`.
+ */
+class MxVoiceWorklet extends AudioWorkletProcessor {
+    // The next rounded second of `currentTime` for which an amplitude mark is due.
+    private nextAmplitudeSecond = 0;
+
+    /**
+     * Invoked by the browser for each block of audio passing through the node.
+     * @param inputs The node's inputs. Only the first channel of the first input is read.
+     * @param outputs Unused: this processor does not write any audio.
+     * @param parameters Unused: this processor does not read any parameters.
+     * @returns Always true, so that the processor is kept alive.
+     */
+    process(
+        inputs: Float32Array[][],
+        outputs: Float32Array[][],
+        parameters: Record<string, Float32Array>
+    ): boolean {
+        // We only fire amplitude updates once a second to avoid flooding the recording instance
+        // with useless data. Much of the data would end up discarded, so we ratelimit ourselves
+        // here.
+        const currentSecond = Math.round(currentTime);
+        if (currentSecond === this.nextAmplitudeSecond) {
+            // We're expecting exactly one mono input source, so just grab the very first frame of
+            // samples for the analysis. The input can be missing entirely though (for example once
+            // the source has been disconnected while we are still being called), in which case
+            // there is nothing to measure and spreading `undefined` below would throw.
+            const firstInput = inputs[0];
+            const monoChan = firstInput && firstInput[0];
+
+            if (monoChan && monoChan.length > 0) {
+                // The amplitude of the frame's samples is effectively the loudness of the frame.
+                // This translates into a bar which can be rendered as part of the whole recording
+                // clip's waveform.
+                //
+                // We translate the amplitude down to 0-1 for sanity's sake.
+                const minVal = Math.min(...monoChan);
+                const maxVal = Math.max(...monoChan);
+                const amplitude = percentageOf(maxVal, -1, 1) - percentageOf(minVal, -1, 1);
+
+                const amplitudePayload: IAmplitudePayload = {
+                    ev: PayloadEvent.AmplitudeMark,
+                    amplitude: amplitude,
+                    forSecond: currentSecond,
+                };
+                this.port.postMessage(amplitudePayload);
+            }
+
+            // Advance even when no sample data was available so we keep the once-a-second cadence.
+            this.nextAmplitudeSecond++;
+        }
+
+        // We mostly use this worklet to fire regular clock updates through to components
+        const timingPayload: ITimingPayload = {ev: PayloadEvent.Timekeep, timeSeconds: currentTime};
+        this.port.postMessage(timingPayload);
+
+        // We're supposed to return false when we're "done" with the audio clip, but seeing as
+        // we are acting as a passive processor we are never truly "done". The browser will clean
+        // us up when it is done with us.
+        return true;
+    }
+}
+
+registerProcessor(WORKLET_NAME, MxVoiceWorklet);
+
+export default null; // to appease module loaders (we never use the export)
--- a/src/voice/VoiceRecording.ts
+++ b/src/voice/VoiceRecording.ts
@ -23,6 +23,8 @@ import {clamp} from "../utils/numbers";
 import EventEmitter from "events";
 import {IDestroyable} from "../utils/IDestroyable";
 import {Singleflight} from "../utils/Singleflight";
+import {PayloadEvent, WORKLET_NAME} from "./consts";
+import {arrayFastClone} from "../utils/arrays";

 const CHANNELS = 1; // stereo isn't important
 const SAMPLE_RATE = 48000; // 48khz is what WebRTC uses. 12khz is where we lose quality.
@ -49,16 +51,34 @@ export class VoiceRecording extends EventEmitter implements IDestroyable {
    private recorderSource: MediaStreamAudioSourceNode;
    private recorderStream: MediaStream;
    private recorderFFT: AnalyserNode;
-    private recorderProcessor: ScriptProcessorNode;
+    private recorderWorklet: AudioWorkletNode;
    private buffer = new Uint8Array(0);
    private mxc: string;
    private recording = false;
    private observable: SimpleObservable<IRecordingUpdate>;
+    private amplitudes: number[] = []; // at each second mark, generated

    public constructor(private client: MatrixClient) {
        super();
    }

+    public get finalWaveform(): number[] {
+        return arrayFastClone(this.amplitudes); // hand out a copy of the amplitude marks collected so far, not our own array
+    }
+
+    public get contentType(): string {
+        return "audio/ogg"; // fixed MIME type; the same value is used as the Blob type when uploading
+    }
+
+    public get contentLength(): number {
+        return this.buffer.length; // `buffer` is a Uint8Array, so this is its size in bytes
+    }
+
+    public get durationSeconds(): number {
+        if (!this.recorder) throw new Error("Duration not available without a recording"); // no recorder has been set up yet
+        return this.recorderContext.currentTime; // the recording context's clock, in seconds
+    }
+
    private async makeRecorder() {
        this.recorderStream = await navigator.mediaDevices.getUserMedia({
            audio: {
@ -80,18 +100,34 @@ export class VoiceRecording extends EventEmitter implements IDestroyable {
        // it makes the time domain less than helpful.
        this.recorderFFT.fftSize = 64;

-        // We use an audio processor to get accurate timing information.
-        // The size of the audio buffer largely decides how quickly we push timing/waveform data
-        // out of this class. Smaller buffers mean we update more frequently as we can't hold as
-        // many bytes. Larger buffers mean slower updates. For scale, 1024 gives us about 30Hz of
-        // updates and 2048 gives us about 20Hz. We use 1024 to get as close to perceived realtime
-        // as possible. Must be a power of 2.
-        this.recorderProcessor = this.recorderContext.createScriptProcessor(1024, CHANNELS, CHANNELS);
+        // Set up our worklet. We use this for timing information and waveform analysis: the
+        // web audio API prefers this be done async to avoid holding the main thread with math.
+        const mxRecorderWorkletPath = document.body.dataset.vectorRecorderWorkletScript;
+        if (!mxRecorderWorkletPath) {
+            throw new Error("Unable to create recorder: no worklet script registered");
+        }
+        await this.recorderContext.audioWorklet.addModule(mxRecorderWorkletPath);
+        this.recorderWorklet = new AudioWorkletNode(this.recorderContext, WORKLET_NAME);

        // Connect our inputs and outputs
        this.recorderSource.connect(this.recorderFFT);
-        this.recorderSource.connect(this.recorderProcessor);
-        this.recorderProcessor.connect(this.recorderContext.destination);
+        this.recorderSource.connect(this.recorderWorklet);
+        this.recorderWorklet.connect(this.recorderContext.destination);
+
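+        // Dev note: we assign `onmessage` instead of using `addEventListener`. A MessagePort only
+        // starts delivering messages implicitly when `onmessage` is set; with `addEventListener`
+        // we would also have to call `port.start()` ourselves.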
+        this.recorderWorklet.port.onmessage = (ev) => {
+            switch (ev.data['ev']) {
+                case PayloadEvent.Timekeep:
+                    this.processAudioUpdate(ev.data['timeSeconds']);
+                    break;
+                case PayloadEvent.AmplitudeMark:
+                    // Sanity check to make sure we're adding about one sample per second
+                    if (ev.data['forSecond'] === this.amplitudes.length) {
+                        this.amplitudes.push(ev.data['amplitude']);
+                    }
+                    break;
+            }
+        };

        this.recorder = new Recorder({
            encoderPath, // magic from webpack
@ -138,7 +174,7 @@ export class VoiceRecording extends EventEmitter implements IDestroyable {
        return this.mxc;
    }

-    private processAudioUpdate = (ev: AudioProcessingEvent) => {
+    private processAudioUpdate = (timeSeconds: number) => {
        if (!this.recording) return;

        // The time domain is the input to the FFT, which means we use an array of the same
@ -162,12 +198,12 @@ export class VoiceRecording extends EventEmitter implements IDestroyable {

        this.observable.update({
            waveform: translatedData,
-            timeSeconds: ev.playbackTime,
+            timeSeconds: timeSeconds,
        });

        // Now that we've updated the data/waveform, let's do a time check. We don't want to
        // go horribly over the limit. We also emit a warning state if needed.
-        const secondsLeft = TARGET_MAX_LENGTH - ev.playbackTime;
+        const secondsLeft = TARGET_MAX_LENGTH - timeSeconds;
        if (secondsLeft <= 0) {
            // noinspection JSIgnoredPromiseFromCall - we aren't concerned with it overlapping
            this.stop();
@ -191,7 +227,6 @@ export class VoiceRecording extends EventEmitter implements IDestroyable {
        }
        this.observable = new SimpleObservable<IRecordingUpdate>();
        await this.makeRecorder();
-        this.recorderProcessor.addEventListener("audioprocess", this.processAudioUpdate);
        await this.recorder.start();
        this.recording = true;
        this.emit(RecordingState.Started);
@ -205,6 +240,7 @@ export class VoiceRecording extends EventEmitter implements IDestroyable {

            // Disconnect the source early to start shutting down resources
            this.recorderSource.disconnect();
+            this.recorderWorklet.disconnect();
            await this.recorder.stop();

            // close the context after the recorder so the recorder doesn't try to
@ -216,7 +252,6 @@ export class VoiceRecording extends EventEmitter implements IDestroyable {

            // Finally do our post-processing and clean up
            this.recording = false;
-            this.recorderProcessor.removeEventListener("audioprocess", this.processAudioUpdate);
            await this.recorder.close();
            this.emit(RecordingState.Ended);

@ -240,7 +275,7 @@ export class VoiceRecording extends EventEmitter implements IDestroyable {

        this.emit(RecordingState.Uploading);
        this.mxc = await this.client.uploadContent(new Blob([this.buffer], {
-            type: "audio/ogg",
+            type: this.contentType,
        }), {
            onlyContentUri: false, // to stop the warnings in the console
        }).then(r => r['content_uri']);
--- a/src/voice/consts.ts
+++ b/src/voice/consts.ts
@ -0,0 +1,37 @@
+/*
+Copyright 2021 The Matrix.org Foundation C.I.C.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+export const WORKLET_NAME = "mx-voice-worklet"; // name the worklet registers itself under, and which VoiceRecording creates its AudioWorkletNode with
+
+export enum PayloadEvent { // discriminator (`ev`) for messages posted by the recorder worklet
+    Timekeep = "timekeep", // clock update; posted on every process() call of the worklet
+    AmplitudeMark = "amplitude_mark", // amplitude sample; the worklet posts at most one per rounded second
+}
+
+export interface IPayload { // base shape shared by every worklet message
+    ev: PayloadEvent; // which kind of payload this is
+}
+
+export interface ITimingPayload extends IPayload { // clock update message
+    ev: PayloadEvent.Timekeep;
+    timeSeconds: number; // the worklet's `currentTime` when the message was posted
+}
+
+export interface IAmplitudePayload extends IPayload { // amplitude sample message
+    ev: PayloadEvent.AmplitudeMark;
+    forSecond: number; // the rounded `currentTime` second this sample was taken for
+    amplitude: number; // loudness of the sampled frame; intended to be within 0-1 per the worklet's comments
+}
+}