From 2e242129b9666df14875b5c7bfd39c9962349696 Mon Sep 17 00:00:00 2001 From: lutangar Date: Fri, 19 Apr 2024 14:09:33 +0200 Subject: [PATCH] chore: add ctranslate2 and timestamped --- .../src/transcription/transcribers.spec.ts | 59 +++++++++++++++++++ .../faster-whisper-transcriber.spec.ts | 28 --------- .../transcriber/openai-transcriber.spec.ts | 28 --------- .../whisper/transcribers.spec.ts | 34 ----------- packages/transcription/src/file-utils.ts | 13 ++++ .../transcription/src/transcriber-factory.ts | 25 ++++++-- packages/transcription/src/whisper/engines.ts | 31 ++++++---- ...nscriber.ts => ctranslate2-transcriber.ts} | 8 ++- .../src/whisper/transcriber/index.ts | 2 + .../whisper/transcriber/openai-transcriber.ts | 12 +++- .../transcriber/timestamped-transcriber.ts | 43 ++++++++++++++ .../transformers-js-transcriber.ts | 13 ++-- .../transcriber/transformers-transcriber.ts | 31 +++++++++- 13 files changed, 207 insertions(+), 120 deletions(-) create mode 100644 packages/tests/src/transcription/transcribers.spec.ts delete mode 100644 packages/tests/src/transcription/whisper/transcriber/faster-whisper-transcriber.spec.ts delete mode 100644 packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts delete mode 100644 packages/tests/src/transcription/whisper/transcribers.spec.ts create mode 100644 packages/transcription/src/file-utils.ts rename packages/transcription/src/whisper/transcriber/{faster-whisper-transcriber.ts => ctranslate2-transcriber.ts} (72%) create mode 100644 packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts diff --git a/packages/tests/src/transcription/transcribers.spec.ts b/packages/tests/src/transcription/transcribers.spec.ts new file mode 100644 index 000000000..e0d6aee13 --- /dev/null +++ b/packages/tests/src/transcription/transcribers.spec.ts @@ -0,0 +1,59 @@ +import { createLogger } from 'winston' +import { join } from 'path' +import { expect } from 'chai' +import { existsSync } from 'node:fs' +import { rm, mkdir, readFile } from 'node:fs/promises' +import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils' +import { transcriberFactory } from '@peertube/peertube-transcription' + +describe('Transcribers', function () { + const transcriptDirectory = join(root(), 'test-transcript') + const vttTranscriptPath = join(transcriptDirectory, 'video_short.vtt') + const transcribers = [ + 'openai-whisper', + 'whisper-ctranslate2', + 'whisper-timestamped' + ] + + before(async function () { + await mkdir(transcriptDirectory, { recursive: true }) + }) + + transcribers.forEach(function (transcriberName) { + describe(`${transcriberName}`, function () { + it(`Should instanciate`, function () { + transcriberFactory.createFromEngineName(transcriberName) + }) + + it('Should run transcription on a media file without raising any errors', async function () { + const transcriber = transcriberFactory.createFromEngineName( + transcriberName, + createLogger(), + transcriptDirectory + ) + const mediaFilePath = buildAbsoluteFixturePath('video_short.mp4') + const transcript = await transcriber.transcribe( + mediaFilePath, + { name: 'tiny' }, + 'fr', + 'vtt' + ) + expect(transcript).to.deep.equals({ + path: vttTranscriptPath, + language: 'fr', + format: 'vtt' + }) + expect(transcript.path).to.equals(vttTranscriptPath) + + expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true + + console.log(await readFile(transcript.path, 'utf8')) + await rm(transcript.path) + }) + }) + }) + + after(async function () { + await rm(transcriptDirectory, { recursive: true, force: true }) + }) +}) diff --git a/packages/tests/src/transcription/whisper/transcriber/faster-whisper-transcriber.spec.ts b/packages/tests/src/transcription/whisper/transcriber/faster-whisper-transcriber.spec.ts deleted file mode 100644 index da30ea5aa..000000000 --- a/packages/tests/src/transcription/whisper/transcriber/faster-whisper-transcriber.spec.ts +++ /dev/null @@ -1,28 +0,0 @@ -import { createLogger } from 'winston' -import { join } from 'path' -import { expect } from 'chai' -import { remove, pathExistsSync } from 'fs-extra/esm.js' -import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils' -import { transcriberFactory } from '@peertube/transcription' - -describe('Open AI Transcriber', function () { - - const transcriptDirectory = join(root(), 'test-transcript') - const vttTranscriptPath = join(transcriptDirectory, 'test.vtt') - - it('Should instanciate', function () { - transcriberFactory.createFromEngineName('faster-whisper') - }) - - it('Should run transcription on a media file without raising any errors', async function () { - const transcriber = transcriberFactory.createFromEngineName('openai-whisper', createLogger(), transcriptDirectory) - const mediaFilePath = buildAbsoluteFixturePath('video_short.mp4') - const transcript = await transcriber.transcribe(mediaFilePath, { name: 'tiny' }, 'fr', 'vtt') - expect(transcript.path).to.equals(vttTranscriptPath) - expect(pathExistsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist`) - }) - - after(async function () { - await remove(transcriptDirectory) - }) -}) diff --git a/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts b/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts deleted file mode 100644 index cc0721c6f..000000000 --- a/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts +++ /dev/null @@ -1,28 +0,0 @@ -import { createLogger } from 'winston' -import { join } from 'path' -import { expect } from 'chai' -import { remove, pathExistsSync } from 'fs-extra/esm.js' -import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils' -import { transcriberFactory } from '@peertube/transcription' - -describe('Open AI Transcriber', function () { - - const transcriptDirectory = join(root(), 'test-transcript') - const vttTranscriptPath = join(transcriptDirectory, 'test.vtt') - - it('Should instanciate', function () { - transcriberFactory.createFromEngineName('openai-whisper') - }) - - it('Should run transcription on a media file without raising any errors', async function () { - const transcriber = transcriberFactory.createFromEngineName('openai-whisper', createLogger(), transcriptDirectory) - const mediaFilePath = buildAbsoluteFixturePath('video_short.mp4') - const transcript = await transcriber.transcribe(mediaFilePath, { name: 'tiny' }, 'fr', 'vtt') - expect(transcript.path).to.equals(vttTranscriptPath) - expect(pathExistsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist`) - }) - - after(async function () { - await remove(transcriptDirectory) - }) -}) diff --git a/packages/tests/src/transcription/whisper/transcribers.spec.ts b/packages/tests/src/transcription/whisper/transcribers.spec.ts deleted file mode 100644 index e1dd9c9b1..000000000 --- a/packages/tests/src/transcription/whisper/transcribers.spec.ts +++ /dev/null @@ -1,34 +0,0 @@ -import { createLogger } from 'winston' -import { join } from 'path' -import { expect } from 'chai' -import { remove, pathExistsSync } from 'fs-extra/esm' -import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils' -import { transcriberFactory } from '@peertube/peertube-transcription' - -describe('Transcribers', function () { - const transcriptDirectory = join(root(), 'test-transcript') - const vttTranscriptPath = join(transcriptDirectory, 'test.vtt') - const transcribers = [ - 'openai-whisper', - 'faster-whisper' - ] - - transcribers.forEach(function (transcriber) { - it(`Should instanciate a ${transcriber} transcriber`, function () { - transcriberFactory.createFromEngineName('openai-whisper') - }) - - it('Should run transcription on a media file without raising any errors', async function () { - const transcriber = transcriberFactory.createFromEngineName('openai-whisper', createLogger(), transcriptDirectory) - const mediaFilePath = buildAbsoluteFixturePath('video_short.mp4') - const transcript = await transcriber.transcribe(mediaFilePath, { name: 'tiny' }, 'fr', 'vtt') - expect(transcript.path).to.equals(vttTranscriptPath) - expect(pathExistsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist`) - }) - - }) - - after(async function () { - await remove(transcriptDirectory) - }) -}) diff --git a/packages/transcription/src/file-utils.ts b/packages/transcription/src/file-utils.ts new file mode 100644 index 000000000..d4d9e2c60 --- /dev/null +++ b/packages/transcription/src/file-utils.ts @@ -0,0 +1,13 @@ +import { basename, extname } from 'path' + +export const getFileInfo = (path: string) => { + const extension = extname(path) + const baseName = basename(path, extension) + const name = `${baseName}${extension}` + + return ({ + extension, + baseName, + name + }) +} diff --git a/packages/transcription/src/transcriber-factory.ts b/packages/transcription/src/transcriber-factory.ts index 230bfaa0e..700c7920d 100644 --- a/packages/transcription/src/transcriber-factory.ts +++ b/packages/transcription/src/transcriber-factory.ts @@ -1,6 +1,9 @@ import { Logger, createLogger } from 'winston' import { TranscriptionEngine } from './transcription-engine.js' -import { TransformersTranscriber, OpenaiTranscriber } from './whisper/index.js' +import { + Ctranslate2Transcriber, + OpenaiTranscriber, WhisperTimestampedTranscriber +} from './whisper/index.js' import { AbstractTranscriber } from './abstract-transcriber.js' export class TranscriberFactory { @@ -10,19 +13,29 @@ export class TranscriberFactory { this.engines = engines } - createFromEngineName (engineName: string, logger: Logger = createLogger(), transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY) { + createFromEngineName ( + engineName: string, + logger: Logger = createLogger(), + transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY + ) { const engine = this.engines.find(({ name }) => name === engineName) if (!engine) { throw new Error(`Unknow engine ${engineName}`) } - const transcriberArgs: ConstructorParameters = [ engine, logger, transcriptDirectory ] + const transcriberArgs: ConstructorParameters = [ + engine, + logger, + transcriptDirectory + ] switch (engineName) { - case 'whisper': + case 'openai-whisper': return new OpenaiTranscriber(...transcriberArgs) - case 'transformers': - return new TransformersTranscriber(...transcriberArgs) + case 'whisper-ctranslate2': + return new Ctranslate2Transcriber(...transcriberArgs) + case 'whisper-timestamped': + return new WhisperTimestampedTranscriber(...transcriberArgs) default: throw new Error(`Unimplemented engine ${engineName}`) } diff --git a/packages/transcription/src/whisper/engines.ts b/packages/transcription/src/whisper/engines.ts index 2f422f93c..cfd93a098 100644 --- a/packages/transcription/src/whisper/engines.ts +++ b/packages/transcription/src/whisper/engines.ts @@ -11,16 +11,16 @@ export const engines: TranscriptionEngine[] = [ license : 'MIT', supportedModelFormats: [ 'ONNX' ] }, - { - name : 'transformers', - description : 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model', - type: 'binary', - language : 'python', - requirements : [], - forgeURL : '', - license : '', - supportedModelFormats: [ 'ONNX' ] - }, + // { + // name : 'transformers', + // description : 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model', + // type: 'binary', + // language : 'python', + // requirements : [], + // forgeURL : '', + // license : '', + // supportedModelFormats: [ 'ONNX' ] + // }, { name: 'openai-whisper', description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model', @@ -42,5 +42,16 @@ export const engines: TranscriptionEngine[] = [ forgeURL: 'https://github.com/openai/whisper', license: 'MIT', supportedModelFormats: [ 'CTranslate2' ] + }, + { + name: 'whisper-timestamped', + description: '', + requirements: [ 'python' ], + language: 'python', + type: 'binary', + binary: 'whisper-ctranslate2', + forgeURL: 'https://github.com/openai/whisper', + license: 'MIT', + supportedModelFormats: [ 'CTranslate2' ] } ] diff --git a/packages/transcription/src/whisper/transcriber/faster-whisper-transcriber.ts b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts similarity index 72% rename from packages/transcription/src/whisper/transcriber/faster-whisper-transcriber.ts rename to packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts index f1a049710..8d9ad9454 100644 --- a/packages/transcription/src/whisper/transcriber/faster-whisper-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts @@ -3,8 +3,9 @@ import { $ } from 'execa' import { TranscriptionModel } from '../../transcription-model.js' import { Transcript, TranscriptFormat } from '../../transcript.js' import { AbstractTranscriber } from '../../abstract-transcriber.js' +import { getFileInfo } from '../../file-utils.js' -export class FasterWhisperTranscriber extends AbstractTranscriber { +export class Ctranslate2Transcriber extends AbstractTranscriber { async transcribe ( mediaFilePath: string, model: TranscriptionModel, @@ -12,8 +13,9 @@ export class FasterWhisperTranscriber extends AbstractTranscriber { format: TranscriptFormat = 'vtt' ): Promise { const $$ = $({ verbose: true }) + const { baseName } = getFileInfo(mediaFilePath) - await $$`whisper ${[ + await $$`whisper-ctranslate2 ${[ mediaFilePath, '--model', model.name, @@ -27,7 +29,7 @@ export class FasterWhisperTranscriber extends AbstractTranscriber { return { language, - path: join(this.transcriptDirectory, `test.${format}`), + path: join(this.transcriptDirectory, `${baseName}.${format}`), format } } diff --git a/packages/transcription/src/whisper/transcriber/index.ts b/packages/transcription/src/whisper/transcriber/index.ts index b4e6e5710..b1d117242 100644 --- a/packages/transcription/src/whisper/transcriber/index.ts +++ b/packages/transcription/src/whisper/transcriber/index.ts @@ -1,3 +1,5 @@ +export * from './ctranslate2-transcriber.js' export * from './transformers-js-transcriber.js' export * from './transformers-transcriber.js' export * from './openai-transcriber.js' +export * from './timestamped-transcriber.js' diff --git a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts index 40c70131e..558660bb1 100644 --- a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts @@ -3,6 +3,7 @@ import { $ } from 'execa' import { TranscriptionModel } from '../../transcription-model.js' import { Transcript, TranscriptFormat } from '../../transcript.js' import { AbstractTranscriber } from '../../abstract-transcriber.js' +import { getFileInfo } from '../../file-utils.js' export class OpenaiTranscriber extends AbstractTranscriber { async transcribe ( @@ -11,9 +12,12 @@ export class OpenaiTranscriber extends AbstractTranscriber { language: string, format: TranscriptFormat = 'vtt' ): Promise { + // Shall we run the command with `{ shell: true }` to get the same error as in sh ? + // ex: ENOENT => Command not found const $$ = $({ verbose: true }) + const { baseName } = getFileInfo(mediaFilePath) - await $$`whisper ${[ + const { stdout } = await $$`whisper ${[ mediaFilePath, '--model', model.name, @@ -22,12 +26,14 @@ export class OpenaiTranscriber extends AbstractTranscriber { '--output_dir', this.transcriptDirectory ]}` + console.log(stdout) - await $$`ls ${this.transcriptDirectory}` + const { stdout: lsStdout } = await $$`ls ${this.transcriptDirectory}` + console.log(lsStdout) return { language, - path: join(this.transcriptDirectory, `test.${format}`), + path: join(this.transcriptDirectory, `${baseName}.${format}`), format } } diff --git a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts new file mode 100644 index 000000000..3cbccec90 --- /dev/null +++ b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts @@ -0,0 +1,43 @@ +import assert from 'node:assert' +import { join } from 'node:path' +import { existsSync } from 'node:fs' +import { rename } from 'node:fs/promises' +import { $ } from 'execa' +import { TranscriptionModel } from '../../transcription-model.js' +import { Transcript, TranscriptFormat } from '../../transcript.js' +import { AbstractTranscriber } from '../../abstract-transcriber.js' +import { getFileInfo } from '../../file-utils.js' + +export class WhisperTimestampedTranscriber extends AbstractTranscriber { + async transcribe ( + mediaFilePath: string, + model: TranscriptionModel, + language: string, + format: TranscriptFormat = 'vtt' + ): Promise { + const $$ = $({ verbose: true }) + const { baseName, name } = getFileInfo(mediaFilePath) + await $$`whisper_timestamped ${[ + mediaFilePath, + '--model', + model.name, + '--output_format', + 'all', + '--output_dir', + this.transcriptDirectory + ]}` + + const internalTranscriptPath = join(this.transcriptDirectory, `${name}.${format}`) + const transcriptPath = join(this.transcriptDirectory, `${baseName}.${format}`) + assert(existsSync(internalTranscriptPath), '') + + await rename(internalTranscriptPath, transcriptPath) + await $$`ls ${this.transcriptDirectory}` + + return { + language, + path: transcriptPath, + format + } + } +} diff --git a/packages/transcription/src/whisper/transcriber/transformers-js-transcriber.ts b/packages/transcription/src/whisper/transcriber/transformers-js-transcriber.ts index c7bb9ab1c..1f0d5a429 100644 --- a/packages/transcription/src/whisper/transcriber/transformers-js-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/transformers-js-transcriber.ts @@ -1,19 +1,18 @@ -import { TranscriptionModel } from '../../transcription-model.js' -import { AbstractTranscriber } from '../../abstract-transcriber.js' -import { Transcript, TranscriptFormat } from '../../transcript.js' -import { Promise } from 'bluebird' +import { TranscriptionModel } from "../../transcription-model.js"; +import { AbstractTranscriber } from "../../abstract-transcriber.js"; +import { Transcript, TranscriptFormat } from "../../transcript.js"; // Disable local models // env.allowLocalModels = true export class TransformersJsTranscriber extends AbstractTranscriber { - async transcribe ( + async transcribe( mediaFilePath: string, model: TranscriptionModel, language: string, - format: TranscriptFormat = 'vtt' + format: TranscriptFormat = "vtt", ): Promise { - return Promise.resolve(undefined) + return Promise.resolve(undefined); // return pipeline('automatic-speech-recognition', 'no_attentions', { // // For medium models, we need to load the `no_attentions` revision to avoid running out of memory // revision: [].includes('/whisper-medium') ? 'no_attentions' : 'main' diff --git a/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts b/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts index 6341e5bef..e07aff11c 100644 --- a/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts +++ b/packages/transcription/src/whisper/transcriber/transformers-transcriber.ts @@ -1,6 +1,8 @@ import { TranscriptionModel } from '../../transcription-model.js' import { AbstractTranscriber } from '../../abstract-transcriber.js' import { Transcript, TranscriptFormat } from '../../transcript.js' +import { $ } from 'execa' +import { join } from 'path' export class TransformersTranscriber extends AbstractTranscriber { async transcribe ( @@ -9,6 +11,33 @@ export class TransformersTranscriber extends AbstractTranscriber { language: string, format: TranscriptFormat = 'vtt' ): Promise { - return Promise.resolve(undefined) + const $$ = $({ verbose: true }) + // const ffmpegChildProcess = $$`ffmpeg ${[ + // '-i', + // mediaFilePath, + // '-vn', // no video + // '-ar', + // 16000, // set the audio sampling frequency + // '-ac', + // '1', // set the number of audio channels to 1 since Vosk is expecting mono + // '-bufsize', + // 1000, // set a buffer size to provide a steady flow of frames + // '-' + // ]}` + + await $$`transformers-cli ${[ + '--task', + 'automatic-speech-recognition', + '--model', + 'openai/whisper-tiny', + '--input', + mediaFilePath + ]}` + + return { + language, + path: join(this.transcriptDirectory, `test.${format}`), + format + } } }