diff --git a/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts b/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts
index 250451e03..c76c715ef 100644
--- a/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts
+++ b/packages/tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts
@@ -10,35 +10,29 @@ config.truncateThreshold = 0
 
 describe('Open AI Whisper transcriber', function () {
   const transcriptDirectory = join(root(), 'test-transcript')
-  const expectedVttTranscriptPath = join(transcriptDirectory, 'video_short.vtt')
+  const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
+
+  const transcriber = new OpenaiTranscriber(
+    {
+      name: 'openai-whisper',
+      requirements: [],
+      type: 'binary',
+      binary: 'whisper',
+      supportedModelFormats: [ 'PyTorch' ]
+    },
+    createLogger(),
+    transcriptDirectory
+  )
 
   before(async function () {
     await mkdir(transcriptDirectory, { recursive: true })
   })
 
-  it('Should transcribe a media file', async function () {
-    const transcriber = new OpenaiTranscriber(
-      {
-        name: 'openai-whisper',
-        requirements: [],
-        language: '',
-        type: 'binary',
-        license: '',
-        supportedModelFormats: [ 'PyTorch' ]
-      },
-      createLogger(),
-      transcriptDirectory
-    )
-    const transcript = await transcriber.transcribe(
-      buildAbsoluteFixturePath('video_short.mp4'),
-      { name: 'tiny' },
-      'fr',
-      'vtt'
-    )
-
+  it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
+    const transcript = await transcriber.transcribe(shortVideoPath)
     expect(transcript).to.deep.equals({
-      path: expectedVttTranscriptPath,
-      language: 'fr',
+      path: join(transcriptDirectory, 'video_short.vtt'),
+      language: 'en',
       format: 'vtt'
     })
 
@@ -54,6 +48,39 @@ You
     )
   })
 
+  it('May produce a transcript file in the `srt` format', async function () {
+    const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt')
+    expect(transcript).to.deep.equals({
+      path: join(transcriptDirectory, 'video_short.srt'),
+      language: 'en',
+      format: 'srt'
+    })
+
+    // eslint-disable-next-line @typescript-eslint/no-unused-expressions
+    expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true
+    expect(await readFile(transcript.path, 'utf8')).to.equal(
+      `1
+00:00:00,000 --> 00:00:02,000
+You
+
+`
+    )
+  })
+
+  it('May produce a transcript file in the `txt` format', async function () {
+    const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt')
+    expect(transcript).to.deep.equals({
+      path: join(transcriptDirectory, 'video_short.txt'),
+      language: 'en',
+      format: 'txt'
+    })
+
+    // eslint-disable-next-line @typescript-eslint/no-unused-expressions
+    expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true
+    expect(await readFile(transcript.path, 'utf8')).to.equal(`You
+`)
+  })
+
   after(async function () {
     await rm(transcriptDirectory, { recursive: true, force: true })
   })
diff --git a/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts b/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts
index e0bdc0c9d..fbf53a3d9 100644
--- a/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts
+++ b/packages/tests/src/transcription/whisper/transcriber/timestamped-transcriber.spec.ts
@@ -4,56 +4,116 @@ import { expect, config } from 'chai'
 import { existsSync } from 'node:fs'
 import { mkdir, readFile, rm } from 'node:fs/promises'
 import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
-import { OpenaiTranscriber } from '@peertube/peertube-transcription'
+import { OpenaiTranscriber, WhisperTimestampedTranscriber } from '@peertube/peertube-transcription'
 
 config.truncateThreshold = 0
 
 describe('Linto timestamped Whisper transcriber', function () {
   const transcriptDirectory = join(root(), 'test-transcript')
-  const expectedVttTranscriptPath = join(transcriptDirectory, 'video_short.vtt')
+  const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
+  const transcriber = new WhisperTimestampedTranscriber(
+    {
+      name: 'whisper-timestamped',
+      requirements: [],
+      type: 'binary',
+      binary: 'whisper_timestamped',
+      supportedModelFormats: [ 'PyTorch' ]
+    },
+    createLogger(),
+    transcriptDirectory
+  )
 
   before(async function () {
     await mkdir(transcriptDirectory, { recursive: true })
   })
 
-  it('Should transcribe a media file', async function () {
-    const transcriber = new OpenaiTranscriber(
-      {
-        name: 'timestamped-whisper',
-        requirements: [],
-        language: '',
-        type: 'binary',
-        license: '',
-        supportedModelFormats: [ 'PyTorch' ]
-      },
-      createLogger(),
-      transcriptDirectory
-    )
+  it('Should transcribe a media file and produce a transcript file in the `vtt` format by default', async function () {
     const transcript = await transcriber.transcribe(
-      buildAbsoluteFixturePath('video_short.mp4'),
+      shortVideoPath,
       { name: 'tiny' },
       'fr',
       'vtt'
     )
 
     expect(transcript).to.deep.equals({
-      path: expectedVttTranscriptPath,
+      path: join(transcriptDirectory, 'video_short.vtt'),
       language: 'fr',
       format: 'vtt'
     })
 
     // eslint-disable-next-line @typescript-eslint/no-unused-expressions
     expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true
+
+    // Whisper timestamped should produce a transcript with sub-second timestamp precision.
     expect(await readFile(transcript.path, 'utf8')).to.equal(
       `WEBVTT
 
-00:00.000 --> 00:02.000
-You
+00:02.480 --> 00:02.500
+you
 
 `
     )
   })
 
+  it('May produce a transcript file in the `srt` format', async function () {
+    const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt')
+    expect(transcript).to.deep.equals({
+      path: join(transcriptDirectory, 'video_short.srt'),
+      language: 'en',
+      format: 'srt'
+    })
+
+    // eslint-disable-next-line @typescript-eslint/no-unused-expressions
+    expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true
+    expect(await readFile(transcript.path, 'utf8')).to.equal(
+      `1
+00:00:02,480 --> 00:00:02,500
+you
+
+`
+    )
+  })
+
+  it('May produce a transcript file in the `txt` format', async function () {
+    const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt')
+    expect(transcript).to.deep.equals({
+      path: join(transcriptDirectory, 'video_short.txt'),
+      language: 'en',
+      format: 'txt'
+    })
+
+    // eslint-disable-next-line @typescript-eslint/no-unused-expressions
+    expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true
+    expect(await readFile(transcript.path, 'utf8')).to.equal(`You
+ `)
+  })
+
+  it('Should produce the same transcript text as openai-whisper given the same parameters', async function () {
+    const transcribeArguments: Parameters<typeof transcriber.transcribe> = [
+      shortVideoPath,
+      { name: 'tiny' },
+      'en',
+      'txt'
+    ]
+    const transcript = await transcriber.transcribe(...transcribeArguments)
+    const openaiTranscriber = new OpenaiTranscriber(
+      {
+        name: 'openai-whisper',
+        requirements: [],
+        type: 'binary',
+        binary: 'whisper',
+        supportedModelFormats: [ 'PyTorch' ]
+      },
+      createLogger(),
+      join(transcriptDirectory, 'openai-whisper')
+    )
+    const openaiTranscript = await openaiTranscriber.transcribe(...transcribeArguments)
+
+    // eslint-disable-next-line @typescript-eslint/no-unused-expressions
+    expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true
+    expect(await readFile(transcript.path, 'utf8')).to.equal(await readFile(openaiTranscript.path, 'utf8'))
+  })
+
   after(async function () {
     await rm(transcriptDirectory, { recursive: true, force: true })
   })
diff --git a/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts b/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts
index b170c5e7e..d2b17a680 100644
--- a/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts
+++ b/packages/tests/src/transcription/whisper/transcriber/whisper-ctranslate2.spec.ts
@@ -4,41 +4,34 @@ import { expect, config } from 'chai'
 import { existsSync } from 'node:fs'
 import { mkdir, readFile, rm } from 'node:fs/promises'
 import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
-import { OpenaiTranscriber } from '@peertube/peertube-transcription'
+import { Ctranslate2Transcriber, OpenaiTranscriber } from '@peertube/peertube-transcription'
 
 config.truncateThreshold = 0
 
 describe('Whisper CTranslate2 transcriber', function () {
   const transcriptDirectory = join(root(), 'test-transcript')
-  const expectedVttTranscriptPath = join(transcriptDirectory, 'video_short.vtt')
+  const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
+  const transcriber = new Ctranslate2Transcriber(
+    {
+      name: 'anyNameShouldBeFineReally',
+      requirements: [],
+      type: 'binary',
+      binary: 'whisper-ctranslate2',
+      supportedModelFormats: []
+    },
+    createLogger(),
+    transcriptDirectory
+  )
 
   before(async function () {
     await mkdir(transcriptDirectory, { recursive: true })
   })
 
-  it('Should transcribe a media file', async function () {
-    const transcriber = new OpenaiTranscriber(
-      {
-        name: 'whisper-ctranslate2',
-        requirements: [],
-        language: '',
-        type: 'binary',
-        license: '',
-        supportedModelFormats: []
-      },
-      createLogger(),
-      transcriptDirectory
-    )
-    const transcript = await transcriber.transcribe(
-      buildAbsoluteFixturePath('video_short.mp4'),
-      { name: 'tiny' },
-      'fr',
-      'vtt'
-    )
-
+  it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
+    const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' })
     expect(transcript).to.deep.equals({
-      path: expectedVttTranscriptPath,
-      language: 'fr',
+      path: join(transcriptDirectory, 'video_short.vtt'),
+      language: 'en',
       format: 'vtt'
     })
 
@@ -54,6 +47,65 @@ You
     )
   })
 
+  it('May produce a transcript file in the `srt` format', async function () {
+    const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt')
+    expect(transcript).to.deep.equals({
+      path: join(transcriptDirectory, 'video_short.srt'),
+      language: 'en',
+      format: 'srt'
+    })
+
+    // eslint-disable-next-line @typescript-eslint/no-unused-expressions
+    expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true
+    expect(await readFile(transcript.path, 'utf8')).to.equal(
+      `1
+00:00:00,000 --> 00:00:02,000
+You
+
+`
+    )
+  })
+
+  it('May produce a transcript file in the `txt` format', async function () {
+    const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt')
+    expect(transcript).to.deep.equals({
+      path: join(transcriptDirectory, 'video_short.txt'),
+      language: 'en',
+      format: 'txt'
+    })
+
+    // eslint-disable-next-line @typescript-eslint/no-unused-expressions
+    expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true
+    expect(await readFile(transcript.path, 'utf8')).to.equal(`You
+`)
+  })
+
+  it('Should produce the same transcript text as openai-whisper given the same parameters', async function () {
+    const transcribeArguments: Parameters<typeof transcriber.transcribe> = [
+      shortVideoPath,
+      { name: 'tiny' },
+      'en',
+      'txt'
+    ]
+    const transcript = await transcriber.transcribe(...transcribeArguments)
+    const openaiTranscriber = new OpenaiTranscriber(
+      {
+        name: 'openai-whisper',
+        requirements: [],
+        type: 'binary',
+        binary: 'whisper',
+        supportedModelFormats: [ 'PyTorch' ]
+      },
+      createLogger(),
+      join(transcriptDirectory, 'openai-whisper')
+    )
+    const openaiTranscript = await openaiTranscriber.transcribe(...transcribeArguments)
+
+    // eslint-disable-next-line @typescript-eslint/no-unused-expressions
+    expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true
+    expect(await readFile(transcript.path, 'utf8')).to.equal(await readFile(openaiTranscript.path, 'utf8'))
+  })
+
   after(async function () {
     await rm(transcriptDirectory, { recursive: true, force: true })
   })
diff --git a/packages/transcription/src/transcription-engine.ts b/packages/transcription/src/transcription-engine.ts
index 24e1953cd..df00f5a4c 100644
--- a/packages/transcription/src/transcription-engine.ts
+++ b/packages/transcription/src/transcription-engine.ts
@@ -6,11 +6,11 @@ import { ModelFormat } from './transcription-model.js'
 export interface TranscriptionEngine {
   name: string
   description?: string
-  language: string
+  language?: string
   requirements: string[]
   type: 'binary' | 'bindings' | 'ws'
-  binary?: string
-  license: string
+  binary: string
+  license?: string
   forgeURL?: string
   supportedModelFormats: ModelFormat[]
diff --git a/packages/transcription/src/whisper/engines.ts b/packages/transcription/src/whisper/engines.ts
index cfd93a098..90a768461 100644
--- a/packages/transcription/src/whisper/engines.ts
+++ b/packages/transcription/src/whisper/engines.ts
@@ -5,6 +5,7 @@ export const engines: TranscriptionEngine[] = [
     name : 'whisper-cpp',
     description : 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
     type: 'binary',
+    binary: 'main',
     language : 'cpp',
     requirements : [],
     forgeURL : 'https://github.com/ggerganov/whisper.cpp',
@@ -49,7 +50,7 @@ export const engines: TranscriptionEngine[] = [
     requirements: [ 'python' ],
     language: 'python',
     type: 'binary',
-    binary: 'whisper-ctranslate2',
+    binary: 'whisper_timestamped',
     forgeURL: 'https://github.com/openai/whisper',
     license: 'MIT',
     supportedModelFormats: [ 'CTranslate2' ]
diff --git a/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts
index 1a0a132db..0b98708b1 100644
--- a/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts
+++ b/packages/transcription/src/whisper/transcriber/ctranslate2-transcriber.ts
@@ -1,37 +1,5 @@
-import { join } from 'path'
-import { $ } from 'execa'
-import { TranscriptionModel } from '../../transcription-model.js'
-import { Transcript, TranscriptFormat } from '../../transcript.js'
-import { AbstractTranscriber } from '../../abstract-transcriber.js'
-import { getFileInfo } from '../../file-utils.js'
+import { OpenaiTranscriber } from './openai-transcriber.js'
 
-export class Ctranslate2Transcriber extends AbstractTranscriber {
-  async transcribe (
-    mediaFilePath: string,
-    model: TranscriptionModel,
-    language: string,
-    format: TranscriptFormat = 'vtt'
-  ): Promise<Transcript> {
-    this.createPerformanceMark()
-    const $$ = $({ verbose: true })
-    const { baseName } = getFileInfo(mediaFilePath)
+export class Ctranslate2Transcriber extends OpenaiTranscriber {
 
-    await $$`whisper-ctranslate2 ${[
-      mediaFilePath,
-      '--model',
-      model.name,
-      '--output_format',
-      'all',
-      '--output_dir',
-      this.transcriptDirectory
-    ]}`
-
-    this.measurePerformanceMark()
-
-    return {
-      language,
-      path: join(this.transcriptDirectory, `${baseName}.${format}`),
-      format
-    }
-  }
 }
diff --git a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts
index da4556ff9..3ef65a93b 100644
--- a/packages/transcription/src/whisper/transcriber/openai-transcriber.ts
+++ b/packages/transcription/src/whisper/transcriber/openai-transcriber.ts
@@ -8,8 +8,8 @@ import { getFileInfo } from '../../file-utils.js'
 export class OpenaiTranscriber extends AbstractTranscriber {
   async transcribe (
     mediaFilePath: string,
-    model: TranscriptionModel,
-    language: string,
+    model: TranscriptionModel = { name: 'tiny' },
+    language: string = 'en',
     format: TranscriptFormat = 'vtt'
   ): Promise<Transcript> {
     this.createPerformanceMark()
@@ -18,14 +18,16 @@ export class OpenaiTranscriber extends AbstractTranscriber {
     const $$ = $({ verbose: true })
     const { baseName } = getFileInfo(mediaFilePath)
 
-    await $$`whisper ${[
+    await $$`${this.engine.binary} ${[
       mediaFilePath,
       '--model',
       model.name,
       '--output_format',
       'all',
       '--output_dir',
-      this.transcriptDirectory
+      this.transcriptDirectory,
+      '--language',
+      language
     ]}`
 
     this.measurePerformanceMark()
diff --git a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts
index 2c904f2e1..0dc7fc0a0 100644
--- a/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts
+++ b/packages/transcription/src/whisper/transcriber/timestamped-transcriber.ts
@@ -5,10 +5,10 @@ import { rename } from 'node:fs/promises'
 import { $ } from 'execa'
 import { TranscriptionModel } from '../../transcription-model.js'
 import { Transcript, TranscriptFormat } from '../../transcript.js'
-import { AbstractTranscriber } from '../../abstract-transcriber.js'
 import { getFileInfo } from '../../file-utils.js'
+import { OpenaiTranscriber } from './openai-transcriber.js'
 
-export class WhisperTimestampedTranscriber extends AbstractTranscriber {
+export class WhisperTimestampedTranscriber extends OpenaiTranscriber {
   async transcribe (
     mediaFilePath: string,
     model: TranscriptionModel,
@@ -19,7 +19,7 @@ export class WhisperTimestampedTranscriber extends AbstractTranscriber {
     const $$ = $({ verbose: true })
     const { baseName, name } = getFileInfo(mediaFilePath)
 
-    await $$`whisper_timestamped ${[
+    await $$`${this.engine.binary} ${[
       mediaFilePath,
       '--model',
       model.name,
@@ -31,7 +31,8 @@ export class WhisperTimestampedTranscriber extends AbstractTranscriber {
     const internalTranscriptPath = join(this.transcriptDirectory, `${name}.${format}`)
     const transcriptPath = join(this.transcriptDirectory, `${baseName}.${format}`)
 
-    assert(existsSync(internalTranscriptPath), '')
+    // Whisper timestamped is supposed to output a file that keeps the media file extension in its name, e.g. video.mp4.vtt
+    assert(existsSync(internalTranscriptPath), `${internalTranscriptPath} file doesn't exist.`)
     await rename(internalTranscriptPath, transcriptPath)
 
     this.measurePerformanceMark()