chore: add ctranslate2 and timestamped

pull/6303/head
lutangar 2024-04-19 14:09:33 +02:00
parent 47095673b3
commit 2e242129b9
13 changed files with 207 additions and 120 deletions

View File

@ -0,0 +1,59 @@
import { createLogger } from 'winston'
import { join } from 'path'
import { expect } from 'chai'
import { existsSync } from 'node:fs'
import { rm, mkdir, readFile } from 'node:fs/promises'
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
import { transcriberFactory } from '@peertube/peertube-transcription'
describe('Transcribers', function () {
  const transcriptDirectory = join(root(), 'test-transcript')
  const vttTranscriptPath = join(transcriptDirectory, 'video_short.vtt')
  // Every engine the factory supports gets the same behavioural suite.
  const transcribers = [
    'openai-whisper',
    'whisper-ctranslate2',
    'whisper-timestamped'
  ]

  before(async function () {
    await mkdir(transcriptDirectory, { recursive: true })
  })

  transcribers.forEach(function (transcriberName) {
    describe(`${transcriberName}`, function () {
      it('Should instantiate', function () {
        transcriberFactory.createFromEngineName(transcriberName)
      })

      it('Should run transcription on a media file without raising any errors', async function () {
        const transcriber = transcriberFactory.createFromEngineName(
          transcriberName,
          createLogger(),
          transcriptDirectory
        )
        const mediaFilePath = buildAbsoluteFixturePath('video_short.mp4')
        const transcript = await transcriber.transcribe(
          mediaFilePath,
          { name: 'tiny' },
          'fr',
          'vtt'
        )

        // The deep equality already pins path, language and format in one assertion.
        expect(transcript).to.deep.equals({
          path: vttTranscriptPath,
          language: 'fr',
          format: 'vtt'
        })
        expect(existsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist.`).to.be.true

        // Remove the transcript so the next engine starts from a clean directory.
        await rm(transcript.path)
      })
    })
  })

  after(async function () {
    await rm(transcriptDirectory, { recursive: true, force: true })
  })
})

View File

@ -1,28 +0,0 @@
import { createLogger } from 'winston'
import { join } from 'path'
import { expect } from 'chai'
import { remove, pathExistsSync } from 'fs-extra/esm.js'
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
import { transcriberFactory } from '@peertube/transcription'
describe('Open AI Transcriber', function () {
  const transcriptDirectory = join(root(), 'test-transcript')
  const vttTranscriptPath = join(transcriptDirectory, 'test.vtt')

  it('Should instantiate', function () {
    transcriberFactory.createFromEngineName('faster-whisper')
  })

  it('Should run transcription on a media file without raising any errors', async function () {
    const transcriber = transcriberFactory.createFromEngineName('openai-whisper', createLogger(), transcriptDirectory)
    const mediaFilePath = buildAbsoluteFixturePath('video_short.mp4')
    const transcript = await transcriber.transcribe(mediaFilePath, { name: 'tiny' }, 'fr', 'vtt')
    expect(transcript.path).to.equals(vttTranscriptPath)
    // `.to.be.true` was missing: a bare expect(value, message) never asserts anything.
    expect(pathExistsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist`).to.be.true
  })

  after(async function () {
    await remove(transcriptDirectory)
  })
})

View File

@ -1,28 +0,0 @@
import { createLogger } from 'winston'
import { join } from 'path'
import { expect } from 'chai'
import { remove, pathExistsSync } from 'fs-extra/esm.js'
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
import { transcriberFactory } from '@peertube/transcription'
describe('Open AI Transcriber', function () {
  const transcriptDirectory = join(root(), 'test-transcript')
  const vttTranscriptPath = join(transcriptDirectory, 'test.vtt')

  it('Should instantiate', function () {
    transcriberFactory.createFromEngineName('openai-whisper')
  })

  it('Should run transcription on a media file without raising any errors', async function () {
    const transcriber = transcriberFactory.createFromEngineName('openai-whisper', createLogger(), transcriptDirectory)
    const mediaFilePath = buildAbsoluteFixturePath('video_short.mp4')
    const transcript = await transcriber.transcribe(mediaFilePath, { name: 'tiny' }, 'fr', 'vtt')
    expect(transcript.path).to.equals(vttTranscriptPath)
    // `.to.be.true` was missing: a bare expect(value, message) never asserts anything.
    expect(pathExistsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist`).to.be.true
  })

  after(async function () {
    await remove(transcriptDirectory)
  })
})

View File

@ -1,34 +0,0 @@
import { createLogger } from 'winston'
import { join } from 'path'
import { expect } from 'chai'
import { remove, pathExistsSync } from 'fs-extra/esm'
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
import { transcriberFactory } from '@peertube/peertube-transcription'
describe('Transcribers', function () {
  const transcriptDirectory = join(root(), 'test-transcript')
  const vttTranscriptPath = join(transcriptDirectory, 'test.vtt')
  const transcribers = [
    'openai-whisper',
    'faster-whisper'
  ]

  // Renamed to transcriberName: the original shadowed the loop variable with
  // the inner `const transcriber`, and — worse — both tests hard-coded
  // 'openai-whisper', so 'faster-whisper' was never actually exercised.
  transcribers.forEach(function (transcriberName) {
    it(`Should instantiate a ${transcriberName} transcriber`, function () {
      transcriberFactory.createFromEngineName(transcriberName)
    })

    it('Should run transcription on a media file without raising any errors', async function () {
      const transcriber = transcriberFactory.createFromEngineName(transcriberName, createLogger(), transcriptDirectory)
      const mediaFilePath = buildAbsoluteFixturePath('video_short.mp4')
      const transcript = await transcriber.transcribe(mediaFilePath, { name: 'tiny' }, 'fr', 'vtt')
      expect(transcript.path).to.equals(vttTranscriptPath)
      // `.to.be.true` was missing: a bare expect(value, message) never asserts anything.
      expect(pathExistsSync(transcript.path), `Transcript file ${transcript.path} doesn't exist`).to.be.true
    })
  })

  after(async function () {
    await remove(transcriptDirectory)
  })
})

View File

@ -0,0 +1,13 @@
import { basename, extname } from 'path'
/**
 * Splits a file path into its pieces:
 * - `extension`: the extension including the leading dot (ex: `.mp4`), empty when none
 * - `baseName`: the file name without its extension (ex: `video_short`)
 * - `name`: the full file name, extension included (ex: `video_short.mp4`)
 */
export const getFileInfo = (path: string) => {
  const extension = extname(path)

  return ({
    extension,
    baseName: basename(path, extension),
    name: basename(path)
  })
}

View File

@ -1,6 +1,9 @@
import { Logger, createLogger } from 'winston'
import { TranscriptionEngine } from './transcription-engine.js'
import { TransformersTranscriber, OpenaiTranscriber } from './whisper/index.js'
import {
Ctranslate2Transcriber,
OpenaiTranscriber, WhisperTimestampedTranscriber
} from './whisper/index.js'
import { AbstractTranscriber } from './abstract-transcriber.js'
export class TranscriberFactory {
@ -10,19 +13,29 @@ export class TranscriberFactory {
this.engines = engines
}
createFromEngineName (engineName: string, logger: Logger = createLogger(), transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY) {
createFromEngineName (
engineName: string,
logger: Logger = createLogger(),
transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY
) {
const engine = this.engines.find(({ name }) => name === engineName)
if (!engine) {
throw new Error(`Unknow engine ${engineName}`)
}
const transcriberArgs: ConstructorParameters<typeof AbstractTranscriber> = [ engine, logger, transcriptDirectory ]
const transcriberArgs: ConstructorParameters<typeof AbstractTranscriber> = [
engine,
logger,
transcriptDirectory
]
switch (engineName) {
case 'whisper':
case 'openai-whisper':
return new OpenaiTranscriber(...transcriberArgs)
case 'transformers':
return new TransformersTranscriber(...transcriberArgs)
case 'whisper-ctranslate2':
return new Ctranslate2Transcriber(...transcriberArgs)
case 'whisper-timestamped':
return new WhisperTimestampedTranscriber(...transcriberArgs)
default:
throw new Error(`Unimplemented engine ${engineName}`)
}

View File

@ -11,16 +11,16 @@ export const engines: TranscriptionEngine[] = [
license : 'MIT',
supportedModelFormats: [ 'ONNX' ]
},
{
name : 'transformers',
description : 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
type: 'binary',
language : 'python',
requirements : [],
forgeURL : '',
license : '',
supportedModelFormats: [ 'ONNX' ]
},
// {
// name : 'transformers',
// description : 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
// type: 'binary',
// language : 'python',
// requirements : [],
// forgeURL : '',
// license : '',
// supportedModelFormats: [ 'ONNX' ]
// },
{
name: 'openai-whisper',
description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
@ -42,5 +42,16 @@ export const engines: TranscriptionEngine[] = [
forgeURL: 'https://github.com/openai/whisper',
license: 'MIT',
supportedModelFormats: [ 'CTranslate2' ]
},
{
name: 'whisper-timestamped',
description: '',
requirements: [ 'python' ],
language: 'python',
type: 'binary',
binary: 'whisper-ctranslate2',
forgeURL: 'https://github.com/openai/whisper',
license: 'MIT',
supportedModelFormats: [ 'CTranslate2' ]
}
]

View File

@ -3,8 +3,9 @@ import { $ } from 'execa'
import { TranscriptionModel } from '../../transcription-model.js'
import { Transcript, TranscriptFormat } from '../../transcript.js'
import { AbstractTranscriber } from '../../abstract-transcriber.js'
import { getFileInfo } from '../../file-utils.js'
export class FasterWhisperTranscriber extends AbstractTranscriber {
export class Ctranslate2Transcriber extends AbstractTranscriber {
async transcribe (
mediaFilePath: string,
model: TranscriptionModel,
@ -12,8 +13,9 @@ export class FasterWhisperTranscriber extends AbstractTranscriber {
format: TranscriptFormat = 'vtt'
): Promise<Transcript> {
const $$ = $({ verbose: true })
const { baseName } = getFileInfo(mediaFilePath)
await $$`whisper ${[
await $$`whisper-ctranslate2 ${[
mediaFilePath,
'--model',
model.name,
@ -27,7 +29,7 @@ export class FasterWhisperTranscriber extends AbstractTranscriber {
return {
language,
path: join(this.transcriptDirectory, `test.${format}`),
path: join(this.transcriptDirectory, `${baseName}.${format}`),
format
}
}

View File

@ -1,3 +1,5 @@
export * from './ctranslate2-transcriber.js'
export * from './transformers-js-transcriber.js'
export * from './transformers-transcriber.js'
export * from './openai-transcriber.js'
export * from './timestamped-transcriber.js'

View File

@ -3,6 +3,7 @@ import { $ } from 'execa'
import { TranscriptionModel } from '../../transcription-model.js'
import { Transcript, TranscriptFormat } from '../../transcript.js'
import { AbstractTranscriber } from '../../abstract-transcriber.js'
import { getFileInfo } from '../../file-utils.js'
export class OpenaiTranscriber extends AbstractTranscriber {
async transcribe (
@ -11,9 +12,12 @@ export class OpenaiTranscriber extends AbstractTranscriber {
language: string,
format: TranscriptFormat = 'vtt'
): Promise<Transcript> {
// Shall we run the command with `{ shell: true }` to get the same error as in sh ?
// ex: ENOENT => Command not found
const $$ = $({ verbose: true })
const { baseName } = getFileInfo(mediaFilePath)
await $$`whisper ${[
const { stdout } = await $$`whisper ${[
mediaFilePath,
'--model',
model.name,
@ -22,12 +26,14 @@ export class OpenaiTranscriber extends AbstractTranscriber {
'--output_dir',
this.transcriptDirectory
]}`
console.log(stdout)
await $$`ls ${this.transcriptDirectory}`
const { stdout: lsStdout } = await $$`ls ${this.transcriptDirectory}`
console.log(lsStdout)
return {
language,
path: join(this.transcriptDirectory, `test.${format}`),
path: join(this.transcriptDirectory, `${baseName}.${format}`),
format
}
}

View File

@ -0,0 +1,43 @@
import assert from 'node:assert'
import { join } from 'node:path'
import { existsSync } from 'node:fs'
import { rename } from 'node:fs/promises'
import { $ } from 'execa'
import { TranscriptionModel } from '../../transcription-model.js'
import { Transcript, TranscriptFormat } from '../../transcript.js'
import { AbstractTranscriber } from '../../abstract-transcriber.js'
import { getFileInfo } from '../../file-utils.js'
export class WhisperTimestampedTranscriber extends AbstractTranscriber {
  /**
   * Runs the `whisper_timestamped` CLI on a media file and returns the
   * resulting transcript descriptor.
   *
   * @param mediaFilePath media file to transcribe
   * @param model model to load, referenced by name (ex: `tiny`)
   * @param language expected language of the media audio track
   * @param format transcript output format, defaults to `vtt`
   */
  async transcribe (
    mediaFilePath: string,
    model: TranscriptionModel,
    language: string,
    format: TranscriptFormat = 'vtt'
  ): Promise<Transcript> {
    const $$ = $({ verbose: true })
    const { baseName, name } = getFileInfo(mediaFilePath)

    await $$`whisper_timestamped ${[
      mediaFilePath,
      '--model',
      model.name,
      '--output_format',
      'all',
      '--output_dir',
      this.transcriptDirectory
    ]}`

    // whisper_timestamped names its outputs after the full media file name,
    // extension included (ex: `video_short.mp4.vtt`); rename to the
    // `<baseName>.<format>` convention shared by the other transcribers.
    const internalTranscriptPath = join(this.transcriptDirectory, `${name}.${format}`)
    const transcriptPath = join(this.transcriptDirectory, `${baseName}.${format}`)
    // Empty assertion message replaced with an actionable one.
    assert(existsSync(internalTranscriptPath), `${internalTranscriptPath} file doesn't exist.`)
    await rename(internalTranscriptPath, transcriptPath)

    return {
      language,
      path: transcriptPath,
      format
    }
  }
}

View File

@ -1,19 +1,18 @@
import { TranscriptionModel } from '../../transcription-model.js'
import { AbstractTranscriber } from '../../abstract-transcriber.js'
import { Transcript, TranscriptFormat } from '../../transcript.js'
import { Promise } from 'bluebird'
import { TranscriptionModel } from "../../transcription-model.js";
import { AbstractTranscriber } from "../../abstract-transcriber.js";
import { Transcript, TranscriptFormat } from "../../transcript.js";
// Disable local models
// env.allowLocalModels = true
export class TransformersJsTranscriber extends AbstractTranscriber {
async transcribe (
async transcribe(
mediaFilePath: string,
model: TranscriptionModel,
language: string,
format: TranscriptFormat = 'vtt'
format: TranscriptFormat = "vtt",
): Promise<Transcript> {
return Promise.resolve(undefined)
return Promise.resolve(undefined);
// return pipeline('automatic-speech-recognition', 'no_attentions', {
// // For medium models, we need to load the `no_attentions` revision to avoid running out of memory
// revision: [].includes('/whisper-medium') ? 'no_attentions' : 'main'

View File

@ -1,6 +1,8 @@
import { TranscriptionModel } from '../../transcription-model.js'
import { AbstractTranscriber } from '../../abstract-transcriber.js'
import { Transcript, TranscriptFormat } from '../../transcript.js'
import { $ } from 'execa'
import { join } from 'path'
export class TransformersTranscriber extends AbstractTranscriber {
async transcribe (
@ -9,6 +11,33 @@ export class TransformersTranscriber extends AbstractTranscriber {
language: string,
format: TranscriptFormat = 'vtt'
): Promise<Transcript> {
return Promise.resolve(undefined)
const $$ = $({ verbose: true })
// const ffmpegChildProcess = $$`ffmpeg ${[
// '-i',
// mediaFilePath,
// '-vn', // no video
// '-ar',
// 16000, // set the audio sampling frequency
// '-ac',
// '1', // set the number of audio channels to 1 since Vosk is expecting mono
// '-bufsize',
// 1000, // set a buffer size to provide a steady flow of frames
// '-'
// ]}`
await $$`transformers-cli ${[
'--task',
'automatic-speech-recognition',
'--model',
'openai/whisper-tiny',
'--input',
mediaFilePath
]}`
return {
language,
path: join(this.transcriptDirectory, `test.${format}`),
format
}
}
}