Johan Dufour 2024-04-30 09:03:30 +00:00 committed by GitHub
commit ef6a3da32f
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
45 changed files with 168523 additions and 1 deletions

1
.gitattributes vendored Normal file
View File

@ -0,0 +1 @@
packages/tests/fixtures/transcription/models/**/* linguist-generated=true

36
.github/workflows/transcription.yml vendored Normal file
View File

@ -0,0 +1,36 @@
name: Transcription
on:
  push:
    branches:
      - transcription-backend-workbench
jobs:
  test:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: './.github/actions/reusable-prepare-peertube-build'
        with:
          node-version: '18.x'
      - uses: './.github/actions/reusable-prepare-peertube-run'
      - name: Install Python libraries
        run: |
          pip3 install openai-whisper
          pip3 install whisper-ctranslate2
          pip3 install whisper-timestamped
          pip3 install jiwer
      - name: Run transcription tests
        run: |
          npm run mocha -- --exit --bail "packages/tests/src/transcription/**/*.spec.ts"
          cat /proc/cpuinfo
          lscpu
          dmidecode --type processor
          lshw -C CPU
          hwinfo --cpu

View File

@ -0,0 +1,6 @@
🇫🇷 DRANE Occitanie - Communiquer lors d'une classe transplantée
[./communiquer-lors-dune-classe-transplantee.mp4](videos/communiquer-lors-dune-classe-transplantee.mp4)
> https://podeduc.apps.education.fr/numerique-educatif/video/21893-communiquer-lors-dune-classe-transplantee/
>
> CC BY-NC-SA 4.0 Deed
> Attribution-NonCommercial-ShareAlike 4.0 International

View File

@ -0,0 +1,223 @@
{
"alignment_heads": [
[
2,
2
],
[
3,
0
],
[
3,
2
],
[
3,
3
],
[
3,
4
],
[
3,
5
]
],
"lang_ids": [
50259,
50260,
50261,
50262,
50263,
50264,
50265,
50266,
50267,
50268,
50269,
50270,
50271,
50272,
50273,
50274,
50275,
50276,
50277,
50278,
50279,
50280,
50281,
50282,
50283,
50284,
50285,
50286,
50287,
50288,
50289,
50290,
50291,
50292,
50293,
50294,
50295,
50296,
50297,
50298,
50299,
50300,
50301,
50302,
50303,
50304,
50305,
50306,
50307,
50308,
50309,
50310,
50311,
50312,
50313,
50314,
50315,
50316,
50317,
50318,
50319,
50320,
50321,
50322,
50323,
50324,
50325,
50326,
50327,
50328,
50329,
50330,
50331,
50332,
50333,
50334,
50335,
50336,
50337,
50338,
50339,
50340,
50341,
50342,
50343,
50344,
50345,
50346,
50347,
50348,
50349,
50350,
50351,
50352,
50353,
50354,
50355,
50356,
50357
],
"suppress_ids": [
1,
2,
7,
8,
9,
10,
14,
25,
26,
27,
28,
29,
31,
58,
59,
60,
61,
62,
63,
90,
91,
92,
93,
359,
503,
522,
542,
873,
893,
902,
918,
922,
931,
1350,
1853,
1982,
2460,
2627,
3246,
3253,
3268,
3536,
3846,
3961,
4183,
4667,
6585,
6647,
7273,
9061,
9383,
10428,
10929,
11938,
12033,
12331,
12562,
13793,
14157,
14635,
15265,
15618,
16553,
16604,
18362,
18956,
20075,
21675,
22520,
26130,
26161,
26435,
28279,
29464,
31650,
32302,
32470,
36865,
42863,
47425,
49870,
50254,
50258,
50358,
50359,
50360,
50361,
50362
],
"suppress_ids_begin": [
220,
50257
]
}

Binary file not shown.

File diff suppressed because it is too large.

File diff suppressed because it is too large.

BIN
packages/tests/fixtures/transcription/models/tiny.pt generated vendored Normal file

Binary file not shown.

View File

@ -0,0 +1,10 @@
Communiquer lors d'une classe transplantée. Utiliser les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
C'est le scénario pédagogique présenté par Monsieur Navoli, professeur en cycle 3 sur une école élémentaire de Montpellier.
La première application utilisée sera la médiathèque. L'enseignant va alors transférer les différentes photos réalisées lors de la classe transplantée.
Dans un dossier spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans l'ENT, dans la médiathèque de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utiliseront le blog, à partir de leurs notes, il pourront, seul ou à 2 par poste rédiger un article dans leur ENT.
Ils illustreront ces articles à l'aide des photos et documents numériques mis en accès libre dans l'ENT.
Pour ce faire, il pourront utiliser l'éditeur avancé qui les renverra directement dans la médiathèque de la classe, où ils pourront retrouver le dossier créé par leur enseignant.
Une fois leur article terminé, les élèves soumettront celui-ci au professeur qui pourra soit l'annoter pour correction ou le publier.
Ensuite, il pourront lire et commenter ceux de leurs camarades, ou répondre aux commentaires de la veille.

View File

@ -0,0 +1,125 @@
import { createLogger } from 'winston'
import { performance, PerformanceObserver } from 'node:perf_hooks'
// import { CpuInfo, CpuUsage } from 'node:os'
import { rm, mkdir } from 'node:fs/promises'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import {
toHumanReadable,
transcriberFactory,
TranscriptFile,
TranscriptFileEvaluator,
TranscriptionEngine
} from '@peertube/peertube-transcription'
const WER_TOLERANCE = 0.01
const CER_TOLERANCE = 0.001
interface TestResult {
uuid: string
WER: number
CER: number
duration: number
engine: TranscriptionEngine
model: string
// dataThroughput: number // relevant ?
// cpus: CpuInfo[] // https://nodejs.org/docs/latest-v18.x/api/os.html#oscpus
// cpuUsages: CpuUsage[] // https://nodejs.org/docs/latest-v18.x/api/process.html#processcpuusagepreviousvalue
// // os.totalmem()
// // os.freemem()
// memoryUsages: Record<number, MemoryUsage> // https://nodejs.org/docs/latest-v18.x/api/process.html#processmemoryusage
}
const benchmarkReducer = (benchmark: Record<string, Partial<TestResult>> = {}, engineName: string, testResult: Partial<TestResult>) => ({
...benchmark,
[engineName]: {
...benchmark[engineName],
...testResult
}
})
interface FormattedTestResult {
WER?: string
CER?: string
duration?: string
model?: string
}
const formatTestResult = ({ WER, CER, duration, model }: Partial<TestResult>): FormattedTestResult => ({
WER: WER ? `${WER * 100}%` : undefined,
CER: CER ? `${CER * 100}%` : undefined,
duration: duration ? toHumanReadable(duration) : undefined,
model
})
describe('Transcribers benchmark', function () {
const transcribers = [
'openai-whisper',
'whisper-ctranslate2',
'whisper-timestamped'
]
const transcriptDirectory = buildAbsoluteFixturePath('transcription/benchmark/')
const mediaFilePath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.mp4')
const referenceTranscriptFile = new TranscriptFile({
path: buildAbsoluteFixturePath('transcription/transcript/reference.txt'),
language: 'fr',
format: 'txt'
})
let benchmark: Record<string, Partial<TestResult>> = {}
before(async function () {
await mkdir(transcriptDirectory, { recursive: true })
const performanceObserver = new PerformanceObserver((items) => {
items
.getEntries()
.forEach((entry) => {
const engineName = transcribers.find(transcriberName => entry.name.includes(transcriberName))
benchmark = benchmarkReducer(benchmark, engineName, {
uuid: entry.name,
duration: entry.duration
})
})
})
performanceObserver.observe({ type: 'measure' })
})
transcribers.forEach(function (transcriberName) {
it(`Run ${transcriberName} transcriber benchmark without issue`, async function () {
this.timeout(45000)
const transcriber = transcriberFactory.createFromEngineName(
transcriberName,
createLogger(),
transcriptDirectory
)
const model = { name: 'tiny' }
const transcriptFile = await transcriber.transcribe(mediaFilePath, model, 'fr', 'txt')
const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)
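// Presumably gives the PerformanceObserver 'measure' callback a tick to run, so the duration
// is recorded before this run's results are merged (the observer fires asynchronously)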
await new Promise(resolve => setTimeout(resolve, 1))
benchmark = benchmarkReducer(benchmark, transcriberName, {
engine: transcriber.engine,
WER: await evaluator.wer(),
CER: await evaluator.cer(),
model: model.name
})
})
})
after(async function () {
console.table(
Object
.keys(benchmark)
.reduce((formattedBenchmark, engineName, currentIndex, array) => ({
...formattedBenchmark,
[engineName]: formatTestResult(benchmark[engineName])
}), {})
)
await rm(transcriptDirectory, { recursive: true, force: true })
performance.clearMarks()
})
})

View File

@ -0,0 +1,17 @@
import { transcriberFactory } from '@peertube/peertube-transcription'
describe('Transcriber factory', function () {
const transcribers = [
'openai-whisper',
'whisper-ctranslate2',
'whisper-timestamped'
]
describe('Should be able to create a transcriber for each available transcription engine', function () {
transcribers.forEach(function (transcriberName) {
it(`Should be able to create a(n) ${transcriberName} transcriber`, function () {
transcriberFactory.createFromEngineName(transcriberName)
})
})
})
})

View File

@ -0,0 +1,66 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, no-new, max-len */
import { TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import { join } from 'path'
import { mkdir, rm } from 'node:fs/promises'
import { expect } from 'chai'
describe('Transcript File Evaluator', function () {
const transcriptDirectory = buildAbsoluteFixturePath('transcription/transcript-evaluator')
const referenceTranscriptFilepath = buildAbsoluteFixturePath('transcription/transcript/reference.txt')
before(async function () {
await mkdir(transcriptDirectory, { recursive: true })
})
it(`may not compare files in a format other than txt`, async function () {
const vttReference = await TranscriptFile.write({
path: join(transcriptDirectory, 'reference.vtt'),
format: 'vtt',
content: ''
})
const vttHypothesis = await TranscriptFile.write({
path: join(transcriptDirectory, 'hypothesis.vtt'),
format: 'vtt',
content: ''
})
expect(() => new TranscriptFileEvaluator(vttReference, vttHypothesis)).to.throw('Can only evaluate txt transcript file')
})
it(`evaluation must return coherent wer & cer`, async function () {
const reference = new TranscriptFile({
path: referenceTranscriptFilepath,
language: 'fr',
format: 'txt'
})
const hypothesis = await TranscriptFile.write({
path: join(transcriptDirectory, 'openai.txt'),
content: `Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
C'est le scénario P-Dagujic présenté par monsieur Navoli, professeur ainsi que le 3 sur une école alimentaire de Montpellier.
La première application a utilisé ce ralame déatec. L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplante.
Dans un dossier, spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans le venté, dans la médiatèque de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rédigeant un article d'un reinté.
Ils illustront ses articles à l'aide des photos de que mon numérique mise à n'accélier dans le venté.
Pour se faire, il pourront utiliser les diteurs avancés qui les renvèrent directement dans la médiatèque de la classe où il pourront retrouver le dossier créé par leurs enseignants.
Une fois leur article terminée, les élèves soumétront se lui-ci au professeur qui pourra soit la noté pour correction ou le public.
Ensuite, il pourront lire et commenter ce de leurs camarades ou répondre aux commentaires de la veille.
`,
format: 'txt',
language: 'fr'
})
const evaluator = new TranscriptFileEvaluator(reference, hypothesis)
const wer = await evaluator.wer()
expect(wer).to.be.below(1)
expect(wer).to.be.greaterThan(0.3)
const cer = await evaluator.cer()
expect(cer).to.be.below(0.1)
expect(cer).to.be.greaterThan(0.09)
console.log(await evaluator.alignement())
})
after(async function () {
await rm(transcriptDirectory, { recursive: true, force: true })
})
})

View File

@ -0,0 +1,26 @@
/* eslint-disable @typescript-eslint/no-unused-expressions */
import { expect } from 'chai'
import { mkdir } from 'node:fs/promises'
import { TranscriptFile } from '@peertube/peertube-transcription'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
describe('Transcript File', function () {
before(async function () {
await mkdir(buildAbsoluteFixturePath('transcription/transcript/'), { recursive: true })
})
it(`may create a new transcript file from scratch`, async function () {
const transcript1 = await TranscriptFile.write({
path: buildAbsoluteFixturePath('transcription/transcript/test1.txt'),
content: 'test2',
format: 'txt'
})
const transcript2 = await TranscriptFile.write({
path: buildAbsoluteFixturePath('transcription/transcript/test2.txt'),
content: 'test2',
format: 'txt'
})
expect(await transcript1.equals(transcript2)).to.be.true
})
})

View File

@ -0,0 +1,140 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
import { expect, config } from 'chai'
import { createLogger } from 'winston'
import { join } from 'path'
import { mkdir, rm } from 'node:fs/promises'
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
import { OpenaiTranscriber, TranscriptFile } from '@peertube/peertube-transcription'
config.truncateThreshold = 0
describe('Open AI Whisper transcriber', function () {
const transcriptDirectory = join(root(), 'test-transcript')
const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.mp4')
const transcriber = new OpenaiTranscriber(
{
name: 'openai-whisper',
requirements: [],
type: 'binary',
binary: 'whisper',
supportedModelFormats: [ 'PyTorch' ]
},
createLogger(),
transcriptDirectory
)
before(async function () {
await mkdir(transcriptDirectory, { recursive: true })
})
it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
const transcript = await transcriber.transcribe(shortVideoPath)
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.vtt'),
language: 'en',
format: 'vtt'
}))).to.be.true
expect(await transcript.read()).to.equals(
`WEBVTT
00:00.000 --> 00:02.000
You
`
)
})
it('May produce a transcript file in the `srt` format', async function () {
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt')
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.srt'),
language: 'en',
format: 'srt'
}))).to.be.true
expect(await transcript.read()).to.equal(
`1
00:00:00,000 --> 00:00:02,000
You
`
)
})
it('May produce a transcript file in the `txt` format', async function () {
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt')
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.txt'),
language: 'en',
format: 'txt'
}))).to.be.true
expect(await transcript.read()).to.equal(`You
`)
})
it('May transcribe a media file using a local PyTorch model', async function () {
await transcriber.transcribe(frVideoPath, { name: 'myLocalModel', path: buildAbsoluteFixturePath('transcription/models/tiny.pt') }, 'fr')
})
it('May transcribe a media file in french', async function () {
this.timeout(45000)
const transcript = await transcriber.transcribe(frVideoPath, { name: 'tiny' }, 'fr', 'txt')
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'),
language: 'fr',
format: 'txt'
}))).to.be.true
expect(await transcript.read()).to.equal(
`Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
C'est le scénario P-Dagujic présenté par monsieur Navoli, professeur ainsi que le 3 sur une école alimentaire de Montpellier.
La première application a utilisé ce ralame déatec. L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplante.
Dans un dossier, spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans le venté, dans la médiatèque de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rédigeant un article d'un reinté.
Ils illustront ses articles à l'aide des photos de que mon numérique mise à n'accélier dans le venté.
Pour se faire, il pourront utiliser les diteurs avancés qui les renvèrent directement dans la médiatèque de la classe où il pourront retrouver le dossier créé par leurs enseignants.
Une fois leur article terminée, les élèves soumétront se lui-ci au professeur qui pourra soit la noté pour correction ou le public.
Ensuite, il pourront lire et commenter ce de leurs camarades ou répondre aux commentaires de la veille.
`
)
})
it('May transcribe a media file in french with small model', async function () {
this.timeout(400000)
const transcript = await transcriber.transcribe(frVideoPath, { name: 'small' }, 'fr', 'txt')
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'),
language: 'fr',
format: 'txt'
}))).to.be.true
expect(await transcript.read()).to.equal(
`Communiquer lors d'une classe transplantée. Utiliser les photos prises lors de cette classe
pour raconter quotidiennement le séjour vécu. C'est le scénario pédagogique présenté
par M. Navoli, professeur en cycle 3 sur une école élémentaire de Montpellier.
La première application à utiliser sera la médiathèque. L'enseignant va alors transférer
les différentes photos réalisées lors de la classe transplantée dans un dossier spécifique
pour que les élèves puissent le retrouver plus facilement. Ils téléversent donc ces
photos dans le dossier, dans le NT, dans la médiathèque de la classe. Pour terminer,
ils s'assurent que le dossier soit bien ouvert aux utilisateurs afin que tout le monde
puisse l'utiliser. Les élèves, par la suite, utiliseront le blog. A partir de leur note,
ils pourront, seul ou à deux par postes, rédiger un article dans leur NT. Ils illustreront
ces articles à l'aide des photos et documents numériques mis en accès libre dans le NT.
Pour ce faire, ils pourront utiliser l'éditeur avancé qui les renverra directement dans
la médiathèque de la classe où ils pourront retrouver le dossier créé par leur enseignant.
Une fois leur article terminé, les élèves soulèteront celui-ci au professeur qui pourra
soit la noter pour correction ou le publier. Ensuite, ils pourront lire et commenter ceux
de leur camarade, ou répondre au commentaire de la veille.
`
)
})
after(async function () {
await rm(transcriptDirectory, { recursive: true, force: true })
})
})

View File

@ -0,0 +1,148 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
import { expect, config } from 'chai'
import { createLogger } from 'winston'
import { join } from 'path'
import { mkdir, rm } from 'node:fs/promises'
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
import { OpenaiTranscriber, WhisperTimestampedTranscriber, TranscriptFile } from '@peertube/peertube-transcription'
config.truncateThreshold = 0
describe('Linto timestamped Whisper transcriber', function () {
const transcriptDirectory = join(root(), 'test-transcript')
const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.mp4')
const transcriber = new WhisperTimestampedTranscriber(
{
name: 'whisper-timestamped',
requirements: [],
type: 'binary',
binary: 'whisper_timestamped',
supportedModelFormats: [ 'PyTorch' ]
},
createLogger(),
transcriptDirectory
)
before(async function () {
await mkdir(transcriptDirectory, { recursive: true })
})
it('Should transcribe a media file and produce a transcript file in `vtt` with a ms precision', async function () {
const transcript = await transcriber.transcribe(
shortVideoPath,
{ name: 'tiny' },
'fr'
)
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.vtt'),
language: 'fr',
format: 'vtt'
}))).to.be.true
expect(await transcript.read()).to.equals(
`WEBVTT
00:02.480 --> 00:02.500
you
`
)
})
it('May produce a transcript file in the `srt` format with a ms precision', async function () {
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt')
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.srt'),
language: 'en',
format: 'srt'
}))).to.be.true
expect(await transcript.read()).to.equals(
`1
00:00:02,480 --> 00:00:02,500
you
`
)
})
it('May produce a transcript file in `txt` format', async function () {
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt')
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.txt'),
language: 'en',
format: 'txt'
}))).to.be.true
expect(await transcript.read()).to.equals(`you
`)
})
it('May transcribe a media file using a local PyTorch model file', async function () {
await transcriber.transcribe(frVideoPath, { name: 'myLocalModel', path: buildAbsoluteFixturePath('transcription/models/tiny.pt') }, 'fr')
})
it('May transcribe a media file in french', async function () {
this.timeout(45000)
const transcript = await transcriber.transcribe(frVideoPath, { name: 'tiny' }, 'fr', 'txt')
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'),
language: 'fr',
format: 'txt'
}))).to.be.true
expect(await transcript.read()).to.equal(
`...
Communiquez lors du ne class et transplanté.
Utilisez les photos prises lors de cette classe pour raconter quotidiennement le seuil jour vécu.
C'est le scénario P.D. à Goujit présenté par M.I.N.A.Voli,
professeur en cycle 3 sur une école émenteur de Montpellier.
La première application a utilisé ce ralame de Yatek.
L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplantée dans un dossier,
spécifique pour que les élèves puissent le retrouver plus facilement.
Il t'éleverce donc ses photos dans le dossier, dans le venté, dans la médiatèque de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utiliseront le blog.
À partir de leur note, il pourront se loi de par poste rédigène article dans le reinté.
Ils illustront ses articles à l'aide des photos de commun numérique mise à n'accélier dans la même thé.
Pour se faire, il pourront utiliser les dites ravences qui les renvèrent directement dans la médiatèque de la classe,
où ils pourront retrouver le dossier créé par leur enseignon.
Une fois leur article terminée, les élèves soumétront se lui-ci au professeur,
qui pourra soit la noter pour correction ou le public.
Ensuite, il pourront lire et commenter ce de leur camarade, ou répondre au commentaire de la veille.
`
)
})
it('Should produce the same transcript text as openai-whisper given the same parameters', async function () {
const transcribeParameters: Parameters<typeof transcriber.transcribe> = [
shortVideoPath,
{ name: 'tiny' },
'en',
'txt'
]
const transcript = await transcriber.transcribe(...transcribeParameters)
const openaiTranscriber = new OpenaiTranscriber(
{
name: 'openai-whisper',
requirements: [],
type: 'binary',
binary: 'whisper',
supportedModelFormats: [ 'PyTorch' ]
},
createLogger(),
join(transcriptDirectory, 'openai-whisper')
)
const openaiTranscript = await openaiTranscriber.transcribe(...transcribeParameters)
expect(await transcript.read()).to.equals(await openaiTranscript.read())
})
after(async function () {
await rm(transcriptDirectory, { recursive: true, force: true })
})
})

View File

@ -0,0 +1,137 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
import { expect, config } from 'chai'
import { createLogger } from 'winston'
import { join } from 'path'
import { mkdir, readFile, rm } from 'node:fs/promises'
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
import { Ctranslate2Transcriber, OpenaiTranscriber, TranscriptFile } from '@peertube/peertube-transcription'
config.truncateThreshold = 0
describe('Whisper CTranslate2 transcriber', function () {
const transcriptDirectory = join(root(), 'test-transcript')
const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.mp4')
const transcriber = new Ctranslate2Transcriber(
{
name: 'anyNameShouldBeFineReally',
requirements: [],
type: 'binary',
binary: 'whisper-ctranslate2',
supportedModelFormats: []
},
createLogger(),
transcriptDirectory
)
before(async function () {
await mkdir(transcriptDirectory, { recursive: true })
})
it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' })
expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.vtt') }))).to.be.true
expect(await readFile(transcript.path, 'utf8')).to.equal(
`WEBVTT
00:00.000 --> 00:02.000
You
`
)
})
it('May produce a transcript file in the `srt` format', async function () {
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt')
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.srt'),
format: 'srt'
}))).to.be.true
expect(await readFile(transcript.path, 'utf8')).to.equal(
`1
00:00:00,000 --> 00:00:02,000
You
`
)
})
it('May produce a transcript file in the `txt` format', async function () {
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt')
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.txt'),
format: 'txt'
}))).to.be.true
expect(await transcript.read()).to.equal(`You
`)
})
it('May transcribe a media file using a local CTranslate2 model', async function () {
const transcript = await transcriber.transcribe(
shortVideoPath,
{ name: 'myLocalModel', path: buildAbsoluteFixturePath('transcription/models/faster-whisper-tiny') },
'en',
'txt'
)
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'video_short.txt'),
format: 'txt'
}))).to.be.true
expect(await transcript.read()).to.equal(`You
`)
})
it('May transcribe a media file in french', async function () {
this.timeout(45000)
const transcript = await transcriber.transcribe(frVideoPath, { name: 'tiny' }, 'fr', 'txt')
expect(await transcript.equals(new TranscriptFile({
path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'),
language: 'fr',
format: 'txt'
}))).to.be.true
expect(await transcript.read()).to.equal(
`Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
C'est le scénario P.Dagujic présenté par Monsieur Navoli, professeur ainsi que le 3 sur une école alimentaire de Montpellier.
La première application utilisée sera la médiatique. L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplante.
Dans un dossier, spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans le venté, dans la médiatique de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utiliseront le blog, à partir de leur nante, il pourront se loi de parposte rédigeant un article d'un orienté.
Ils illustront ces articles à l'aide des photos de commun numériques mises un accès libre dans leaineté. Pour se faire, il pourront utiliser les détecteurs avancés qui des renvers un directement dans la médiatique de la classe, où il pourront retrouver le dossier créé par leur enseignant.
Une fois leur article terminée, les élèves soumettront celui-ci au professeur qui pourra soit la noté pour correction ou le public.
Ensuite, il pourront lire et commenter ce de leur camarade, on répondra au commentaire de la veille.
`
)
})
it('Should produce the same transcript text as openai-whisper given the same parameters', async function () {
const transcribeArguments: Parameters<typeof transcriber.transcribe> = [
shortVideoPath,
{ name: 'tiny' },
'en',
'txt'
]
const transcript = await transcriber.transcribe(...transcribeArguments)
const openaiTranscriber = new OpenaiTranscriber(
{
name: 'openai-whisper',
requirements: [],
type: 'binary',
binary: 'whisper',
supportedModelFormats: [ 'PyTorch' ]
},
createLogger(),
join(transcriptDirectory, 'openai-whisper')
)
const openaiTranscript = await openaiTranscriber.transcribe(...transcribeArguments)
expect(await transcript.equals(openaiTranscript)).to.be.true
})
after(async function () {
await rm(transcriptDirectory, { recursive: true, force: true })
})
})

View File

@ -6,7 +6,8 @@
"tsBuildInfoFile": "./dist/.tsbuildinfo",
"paths": {
"@tests/*": [ "./src/*" ],
"@server/*": [ "../../server/core/*" ]
"@server/*": [ "../../server/core/*" ],
"@peertube/peertube-transcription": [ "../transcription" ]
}
},
"references": [
@ -16,6 +17,7 @@
{ "path": "../node-utils" },
{ "path": "../typescript-utils" },
{ "path": "../server-commands" },
{ "path": "../transcription" },
{ "path": "../../server/tsconfig.lib.json" }
],
"include": [

View File

@ -0,0 +1,92 @@
DeepLearningFramework vs training libraries
https://github.com/openai/whisper/blob/main/whisper/__init__.py#L144
```typescript
interface DeepLearningFramework {
  name: string
  distributed?: boolean
  gpu?: boolean
}

const deepLearningFrameworks: DeepLearningFramework[] = [
  {
    name: 'PyTorch',
    distributed: true,
    gpu: true
  },
  {
    name: 'TensorFlow'
  }
]
```
What about the lifecycle of each transcriber?
- install => installer
- update => updater
For the **Python** packages:
1. Install
```sh
pip install <package-name>
```
The package version should be constrained to a version compatible with our wrapper.
We could also attempt to run our tests against different versions of the lib to be future-proof.
2. Update
```sh
pip install -U <package-name>
```
> Need the package name somewhere in the model
>
>
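A minimal sketch of what such an installer/updater could look like, assuming we keep shelling out with execa as the transcribers already do. `PythonRequirement` and `PipInstaller` are hypothetical names, not part of the package:

```typescript
import { $ } from 'execa'

// Hypothetical descriptor: which pip package backs an engine, and the version we pin it to.
interface PythonRequirement {
  packageName: string // e.g. 'openai-whisper'
  version?: string // a version known to be compatible with our wrapper
}

export class PipInstaller {
  // pip install <package-name>==<version>
  async install ({ packageName, version }: PythonRequirement) {
    await $`pip3 install ${version ? `${packageName}==${version}` : packageName}`
  }

  // pip install -U <package-name>
  async update ({ packageName }: PythonRequirement) {
    await $`pip3 install -U ${packageName}`
  }
}
```

Usage would then be something like `await new PipInstaller().install({ packageName: 'openai-whisper' })`, with the package name carried by the engine definition as the note above suggests.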
### Whisper timestamped discrepancies
- Lower case instead of upper case
- missing .json file
- binary name is awkward: the package is named whisper-timestamped but the binary is named whisper_timestamped
> https://github.com/linto-ai/whisper-timestamped/issues?q=is:issue+author:lutangar
## About models
Convert Whisper transformer models from PyTorch to ggml format:
the original Whisper PyTorch models provided by OpenAI are converted to a ggml format in order to be able to load them in C/C++.
In supervised machine learning, the artefact created after training that is used to make predictions on new data is called a model.
Models can be saved in a file that can potentially be compressed, so model files typically have a binary file format.
TensorFlow saves models as protocol buffer files, with a .pb file extension.
Keras saves models natively as .h5 files.
Scikit-Learn saves models as pickled Python objects, with a .pkl file extension.
An older format for model serving based on XML, predictive model markup language (.pmml), is still usable on some frameworks, such as Scikit-Learn.
Training File Formats:
- petastorm
- npy
- tfrecords

Model Serving Serialization Formats:
- pb
- mlmodel
- onnx
- pkl
- older: h5, pmml
Hugging Face fine-tuned models to ggml format
or Whisper transformer model ?
ML models vs Transformer Model
Transcription Model
Other model file formats that are used include SparkML models that can be saved in the MLeap file format and served in real time using an MLeap model server (files are packaged in .zip format). Apple developed the .mlmodel file format to store models embedded in iOS applications as part of its Core ML framework (which has superior support for the Objective-C and Swift languages). Applications trained in TensorFlow, Scikit-Learn, and other frameworks need to convert their model files to the .mlmodel file format for use on iOS, with tools like coremltools and the TensorFlow converter available to help with file format conversion. ONNX is an ML-framework-independent file format, supported by Microsoft, Facebook, and Amazon. In theory, any ML framework should be able to export its models in the .onnx file format, so it offers great promise in unifying model serving across the different frameworks. However, as of late 2019, ONNX does not support all operations for the most popular ML frameworks (TensorFlow, PyTorch, Scikit-Learn), so ONNX is not yet practical for those frameworks. In PyTorch, the recommended way to serve models is to use TorchScript to trace and save a model as a .pt file and serve it from a C++ application.
One final file format to mention here is YAML, which is used to package models as part of the MLFlow framework for ML pipelines on Spark. MLFlow stores a YAML file that describes the files it packages for model serving, so that deployment tools can understand the model file format and know which files to deploy.
// ModelServingFileSerializationFormats
File formats: .pb, .onnx, .pkl, .mlmodel, .zip, .pmml, .pt
Inference:
- .pb files are served by TensorFlow Serving;
- .onnx files are served by Microsoft's commercial model serving platform;
- .pkl files are served for Scikit-Learn models, often on Flask servers;
- .mlmodel files are served by iOS platforms;
- .zip files are used to package up MLeap files that are served on the MLeap runtime;
- .pt files are used to package PyTorch models that can be served inside C++ applications.

'PyTorch' | 'GGML' | 'ONNX' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark
https://towardsdatascience.com/guide-to-file-formats-for-machine-learning-columnar-training-inferencing-and-the-feature-store-2e0c3d18d4f9
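As a rough illustration of the mapping above, a sketch of how the `ModelFormat` used by `TranscriptionModel` could be guessed from a path on disk — a sketch only, assuming the usual extensions, that CTranslate2 models ship as a directory containing a `model.bin`, and that ggml models use the conventional `.bin` extension; `guessModelFormat` is a hypothetical helper, not part of the package:

```typescript
import { extname } from 'node:path'
import { statSync } from 'node:fs'

// Same union as in transcription-model.ts
type ModelFormat = 'PyTorch' | 'GGML' | 'ONNX' | 'CTranslate2'

export function guessModelFormat (modelPath: string): ModelFormat | undefined {
  // CTranslate2 models are distributed as a directory (model.bin + config/vocabulary files)
  if (statSync(modelPath).isDirectory()) return 'CTranslate2'

  switch (extname(modelPath)) {
    case '.pt':
      return 'PyTorch' // e.g. the tiny.pt Whisper checkpoint used in the test fixtures
    case '.onnx':
      return 'ONNX'
    case '.bin':
      return 'GGML' // whisper.cpp ggml-*.bin files, by convention
    default:
      return undefined
  }
}
```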

View File

@ -0,0 +1,19 @@
{
"name": "@peertube/peertube-transcription",
"private": true,
"version": "0.0.0",
"main": "dist/index.js",
"files": [ "dist" ],
"exports": {
"types": "./dist/index.d.ts",
"peertube:tsx": "./src/index.ts",
"default": "./dist/index.js"
},
"type": "module",
"devDependencies": {},
"scripts": {
"build": "tsc",
"watch": "tsc -w"
},
"dependencies": {}
}

View File

@ -0,0 +1,81 @@
import { join } from 'node:path'
import { existsSync } from 'node:fs'
import { PerformanceObserver } from 'node:perf_hooks'
import assert from 'node:assert'
import { createLogger, Logger } from 'winston'
import short from 'short-uuid'
import { root } from '@peertube/peertube-node-utils'
import { TranscriptionEngine } from './transcription-engine.js'
import { TranscriptionModel } from './transcription-model.js'
import { TranscriptFile, TranscriptFormat } from './transcript/index.js'
export abstract class AbstractTranscriber {
public static DEFAULT_TRANSCRIPT_DIRECTORY = join(root(), 'dist', 'transcripts')
engine: TranscriptionEngine
logger: Logger
transcriptDirectory: string
performanceObserver?: PerformanceObserver
runId?: string
constructor (
engine: TranscriptionEngine,
logger: Logger = createLogger(),
transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY,
performanceObserver?: PerformanceObserver
) {
this.engine = engine
this.logger = logger
this.transcriptDirectory = transcriptDirectory
this.performanceObserver = performanceObserver
}
detectLanguage () {
return Promise.resolve('')
}
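// Stub: for now this only checks whether the local model file exists on disk.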
loadModel (model: TranscriptionModel) {
if (existsSync(model.path)) { /* empty */ }
}
supports (model: TranscriptionModel) {
return model.format === 'PyTorch'
}
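// Each transcription run gets a unique id (runId). Concrete transcribers call createPerformanceMark()
// right before running the binary and measurePerformanceMark() once it is done, so a PerformanceObserver
// (see the benchmark spec) can collect a 'measure' entry with the duration of every run.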
createPerformanceMark () {
this.runId = `${short.uuid()}-${this.engine.name}`
performance.mark(this.getStartPerformanceMarkName())
}
measurePerformanceMark () {
try {
performance.mark(this.getEndPerformanceMarkName())
performance.measure(
this.runId,
this.getStartPerformanceMarkName(),
this.getEndPerformanceMarkName()
)
} catch (e) {
this.logger.log({ level: 'error', message: e })
}
}
getStartPerformanceMarkName () {
assert(!!this.runId, 'Each transcription run should have an id.')
return `${this.runId}-started`
}
getEndPerformanceMarkName () {
assert(!!this.runId, 'Each transcription run should have an id.')
return `${this.runId}-ended`
}
abstract transcribe (
mediaFilePath: string,
model: TranscriptionModel,
language: string,
format: TranscriptFormat
): Promise<TranscriptFile>
}

View File

@ -0,0 +1,23 @@
import { expect } from 'chai'
import { toHumanReadable, toTimecode } from './duration.js'
describe('duration conversion functions', () => {
it('toHumanReadable', () => {
const ONE_MINUTE = 60000
let humanDuration = toHumanReadable(ONE_MINUTE)
expect(humanDuration).to.equal('1m')
humanDuration = toHumanReadable(ONE_MINUTE * 60 + ONE_MINUTE)
expect(humanDuration).to.equal('1h 1m')
})
it('toTimecode', () => {
const MORE_OR_LESS_ONE_MINUTE = '60.41545'
let timecode = toTimecode(MORE_OR_LESS_ONE_MINUTE)
expect(timecode).to.equal('00:01:00')
const ONE_HOUR = '3600'
timecode = toTimecode(ONE_HOUR)
expect(timecode).to.equal('01:00:00')
})
})

View File

@ -0,0 +1,35 @@
export interface DurationDescriptor {
duration: number
unit: string
}
export function toHumanReadable (ms: number) {
const date = new Date(ms)
const durationDescriptors: DurationDescriptor[] = [
{ duration: date.getUTCHours(), unit: 'h' },
{ duration: date.getUTCMinutes(), unit: 'm' },
{ duration: date.getUTCSeconds(), unit: 's' }
]
return durationDescriptors
.map(toWords)
.filter((words) => words)
.join(' ')
}
export function toWords ({ duration, unit }: DurationDescriptor) {
return duration > 0 ? `${duration}${unit}` : ''
}
export function toTimecode (s: number | string) {
const date = new Date(0, 0, 0, 0, 0, parseFloat(s.toString()), 0)
const hours = date.getHours()
const minutes = date.getMinutes()
const seconds = date.getSeconds()
return `${padLeft(hours)}:${padLeft(minutes)}:${padLeft(seconds)}`
}
function padLeft (value: number, length = 2): string {
return value.toString().padStart(length, '0')
}

View File

@ -0,0 +1,13 @@
import { basename, extname } from 'path'
export const getFileInfo = (path: string) => {
const extension = extname(path)
const baseName = basename(path, extension)
const name = `${baseName}${extension}`
return ({
extension,
baseName,
name
})
}

View File

@ -0,0 +1,11 @@
import { TranscriberFactory } from './transcriber-factory.js'
import { engines } from './whisper/index.js'
export * from './duration.js'
export * from './transcript/index.js'
export * from './transcription-engine.js'
export * from './transcription-model.js'
export * from './whisper/index.js'
export const transcriberFactory = new TranscriberFactory(engines)

View File

@ -0,0 +1,49 @@
import { Logger, createLogger } from 'winston'
import { TranscriptionEngine } from './transcription-engine.js'
import {
Ctranslate2Transcriber,
OpenaiTranscriber, WhisperTimestampedTranscriber
} from './whisper/index.js'
import { AbstractTranscriber } from './abstract-transcriber.js'
export class TranscriberFactory {
engines: TranscriptionEngine[]
constructor (engines: TranscriptionEngine[]) {
this.engines = engines
}
createFromEngineName (
engineName: string,
logger: Logger = createLogger(),
transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY
) {
const engine = this.getEngineByName(engineName)
const transcriberArgs: ConstructorParameters<typeof AbstractTranscriber> = [
engine,
logger,
transcriptDirectory
]
switch (engineName) {
case 'openai-whisper':
return new OpenaiTranscriber(...transcriberArgs)
case 'whisper-ctranslate2':
return new Ctranslate2Transcriber(...transcriberArgs)
case 'whisper-timestamped':
return new WhisperTimestampedTranscriber(...transcriberArgs)
default:
throw new Error(`Unimplemented engine ${engineName}`)
}
}
getEngineByName (engineName: string) {
const engine = this.engines.find(({ name }) => name === engineName)
if (!engine) {
throw new Error(`Unknown engine ${engineName}`)
}
return engine
}
}

View File

@ -0,0 +1,3 @@
export * from './transcript-file.js'
export * from './transcript-file-evaluator.js'
export * from './transcript-file-interface.js'

View File

@ -0,0 +1,75 @@
import { $ } from 'execa'
import assert from 'node:assert'
import { TranscriptFile } from './index.js'
/**
* This transcript evaluator is based on the Jiwer CLI, a Python implementation:
* https://jitsi.github.io/jiwer/cli/
*
* There are plenty of implementations of WER (Word Error Rate) and CER (Character Error Rate) calculation in Python,
* but not that many in Node.js.
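*
* Under the hood this simply shells out to the jiwer CLI, e.g. (assuming jiwer is installed):
*   jiwer --reference reference.txt --hypothesis hypothesis.txt -g
* which prints the WER as a plain number on stdout.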
*/
export class TranscriptFileEvaluator {
referenceTranscriptFile: TranscriptFile
hypothesisTranscriptFile: TranscriptFile
constructor (referenceTranscriptFile: TranscriptFile, hypothesisTranscriptFile: TranscriptFile) {
assert(referenceTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')
assert(hypothesisTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')
this.referenceTranscriptFile = referenceTranscriptFile
this.hypothesisTranscriptFile = hypothesisTranscriptFile
}
static buildArgs (referenceTranscriptFilepath: string, hypothesisTranscriptFilepath: string, ...args: string[]) {
return [
'--reference',
referenceTranscriptFilepath,
'--hypothesis',
hypothesisTranscriptFilepath,
...args
]
}
buildArgs (...args: string[]) {
return TranscriptFileEvaluator.buildArgs(this.referenceTranscriptFile.path, this.hypothesisTranscriptFile.path, ...args)
}
/**
* WER: Word Error Rate
*/
async wer () {
const { stdout: wer } = await $`jiwer ${this.buildArgs('-g')}`
return Number(wer)
}
/**
* CER: Character Error Rate
*/
async cer () {
// @see https://github.com/jitsi/jiwer/issues/87
let result = {
stdout: undefined
}
try {
result = await $`jiwer ${this.buildArgs('--cer')}`
} catch {}
return result.stdout ? Number(result.stdout) : undefined
}
async alignement () {
const { stdout: alignement } = await $`jiwer ${this.buildArgs('--align')}`
return alignement
}
async evaluate () {
return {
wer: await this.wer(),
cer: await this.cer(),
alignement: await this.alignement()
}
}
}

View File

@ -0,0 +1,3 @@
export type TranscriptFormat = 'txt' | 'vtt' | 'srt'
export type TranscriptFileInterface = { path: string, language?: string, format: TranscriptFormat }

View File

@ -0,0 +1,50 @@
import { statSync } from 'node:fs'
import { readFile, writeFile } from 'node:fs/promises'
import { TranscriptFileInterface, TranscriptFormat } from './transcript-file-interface.js'
import { TranscriptFileEvaluator } from './transcript-file-evaluator.js'
export class TranscriptFile implements TranscriptFileInterface {
path: string
language: string = 'en'
format: TranscriptFormat = 'vtt'
constructor ({ path, language = 'en', format = 'vtt' }: { path: string, language?: string, format?: TranscriptFormat }) {
statSync(path)
this.path = path
this.language = language
this.format = format
}
/**
* Asynchronously reads the entire contents of a transcript file.
* @see https://nodejs.org/docs/latest-v18.x/api/fs.html#filehandlereadfileoptions for options
*/
async read (options: Parameters<typeof readFile>[1] = 'utf8') {
return await readFile(this.path, options)
}
/**
* Write a transcript file to disk.
*/
static async write ({
path,
content,
language = 'en',
format = 'vtt'
}: { path: string, content: string, language?: string, format?: TranscriptFormat }): Promise<TranscriptFile> {
await writeFile(path, content)
return new TranscriptFile({ path, language, format })
}
async equals (transcript: TranscriptFile) {
return await transcript.read() === await this.read()
}
async evaluate (transcript: TranscriptFile) {
const evaluator = new TranscriptFileEvaluator(this, transcript)
return evaluator.evaluate()
}
}

View File

@ -0,0 +1,22 @@
import { ModelFormat } from './transcription-model.js'
/**
* The engine, or framework.
*/
export class TranscriptionEngine {
name: string
description?: string
language?: string
requirements: string[]
type: 'binary' | 'bindings' | 'ws'
binary: string
license?: string
forgeURL?: string
supportedModelFormats: ModelFormat[]
// There could be a default model, or a list of default models.
constructor (parameters: TranscriptionEngine) {
Object.assign(this, parameters)
}
}

View File

@ -0,0 +1,19 @@
export type ModelFormat = 'PyTorch' | 'GGML' | 'ONNX' | 'CTranslate2' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark
export abstract class TranscriptionModel {
name: string
format?: ModelFormat
path?: string
url?: string
// # - hparams
// # - Number of dimensions (int)
// # - Name length (int)
// # - Dimensions (int[n_dims])
// # - Name (char[name_length])
// # - Data (float[n_dims])
// # - mel filters
// # - tokenizer vocab
// # - model variables
}

View File

@ -0,0 +1,81 @@
- cpp
- ctranslate2
- faster-whisper
- insanely-fast-whisper
- whisper
- transformers.js
- whisperX
Transformers* could be defined as an all-purpose inference engine instead of a Whisper-only engine:
- to create a video summary
-
// mixed precision training
// env.cacheDir = './.cache';
// env.localModelPath = '/path/to/models/';
// env.allowRemoteModels = false;
// To optimize the data pipeline, you should use techniques such as
// caching,
// prefetching,
// batching,
// sharding, and
// compression, depending on the characteristics and size of your data.
// You should also monitor the data throughput and utilization of the GPU and CPU devices, and adjust the data pipeline accordingly.
// 1) Prefetching: To load data asynchronously while the model is training on the current batch. This minimizes data loading bottlenecks.
// 2) Data sampling for initial models: for initial model development or debugging, working with a smaller subset of your data can help speed up setup and output.
// 3) Parallel processing: this is the most obvious and important point. Utilize multi-threading or multiprocessing libraries like concurrent.futures in Python to preprocess data in parallel. This is particularly effective when dealing with large datasets.
// https://www.linkedin.com/advice/3/how-can-you-optimize-machine-learning-models
// Use mixed precision training
// Apply model pruning and quantization
// Sizing the model will almost always help with performance,
// On GPUs,
// - leverage batch processing
// - and mixed-precision training,
// - manage GPU memory,
// - and consider model pruning.
// For CPUs,
// - utilize multi-threading,
// - efficient libraries,
// - batch inference, quantization,
// - and model optimization.
// - Employ
// - compiler flags,
// - caching,
// - and distributed computing for CPU performance.
// Profiling tools help identify bottlenecks on both hardware types, ensuring efficient model deployment in diverse environments.
// The choice between GPU and CPU optimization depends on the specific task and hardware resources available.
// It would be nice to be able to run tests on GPU runners from GitHub Actions:
// https://resources.github.com/devops/accelerate-your-cicd-with-arm-and-gpu-runners-in-github-actions/
// Techniques such as
// model quantization, pruning,
// and other optimizations can further enhance the efficiency of running these models on CPU hardware.
// If you're looking to deploy Whisper models on CPU-based systems, you can use popular deep learning frameworks like TensorFlow or PyTorch, which provide support for deploying models on CPU and offer optimizations for inference performance. Additionally, platforms like ONNX Runtime or TensorFlow Lite offer optimizations for inference on CPU, including support for quantized models and hardware acceleration where available.
// https://eval.ai/web/challenges/challenge-page/1637/overview
// https://github.com/fquirin/speech-recognition-experiments
// => are producing models
// PyTorch and TensorFlow
// deepLearningFramework
// cpp.ts
// ctranslate2.ts
// faster.ts
// insanely-fast.ts
// python.ts
// transformer.ts
// X .ts
// whisper.cpp
// ggml

View File

@ -0,0 +1,58 @@
import { TranscriptionEngine } from '../transcription-engine.js'
export const engines: TranscriptionEngine[] = [
{
name: 'whisper-cpp',
description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
type: 'binary',
binary: 'main',
language: 'cpp',
requirements: [],
forgeURL: 'https://github.com/ggerganov/whisper.cpp',
license: 'MIT',
supportedModelFormats: [ 'GGML' ]
},
// {
// name : 'transformers',
// description : 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
// type: 'binary',
// language : 'python',
// requirements : [],
// forgeURL : '',
// license : '',
// supportedModelFormats: [ 'ONNX' ]
// },
{
name: 'openai-whisper',
description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
requirements: [ 'python', 'pyTorch', 'ffmpeg' ],
language: 'python',
type: 'binary',
binary: 'whisper',
forgeURL: 'https://github.com/openai/whisper',
license: 'MIT',
supportedModelFormats: [ 'PyTorch' ]
},
{
name: 'whisper-ctranslate2',
description: '',
requirements: [ 'python' ],
language: 'python',
type: 'binary',
binary: 'whisper-ctranslate2',
forgeURL: 'https://github.com/Softcatala/whisper-ctranslate2',
license: 'MIT',
supportedModelFormats: [ 'CTranslate2' ]
},
{
name: 'whisper-timestamped',
description: '',
requirements: [ 'python' ],
language: 'python',
type: 'binary',
binary: 'whisper_timestamped',
forgeURL: 'https://github.com/linto-ai/whisper-timestamped',
license: 'MIT',
supportedModelFormats: [ 'PyTorch' ]
}
]

View File

@ -0,0 +1,2 @@
export * from './transcriber/index.js'
export * from './engines.js'

View File

@ -0,0 +1,48 @@
import { $ } from 'execa'
import { join } from 'path'
import { lstat } from 'node:fs/promises'
import { OpenaiTranscriber } from './openai-transcriber.js'
import { TranscriptionModel } from '../../transcription-model.js'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { getFileInfo } from '../../file-utils.js'
export class Ctranslate2Transcriber extends OpenaiTranscriber {
public static readonly MODEL_FILENAME = 'model.bin'
async transcribe (
mediaFilePath: string,
model: TranscriptionModel = { name: 'tiny' },
language: string = 'en',
format: TranscriptFormat = 'vtt'
): Promise<TranscriptFile> {
this.createPerformanceMark()
// Shall we run the command with `{ shell: true }` to get the same error as in sh?
// ex: ENOENT => Command not found
const $$ = $({ verbose: true })
const { baseName } = getFileInfo(mediaFilePath)
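// A local CTranslate2 model is a *directory* (containing model.bin), not a single checkpoint file,
// hence the directory check below and the --model_directory flag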
if (model.path) {
await lstat(model.path).then(stats => stats.isDirectory())
}
const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]
await $$`${this.engine.binary} ${[
mediaFilePath,
...modelArgs,
'--output_format',
format,
'--output_dir',
this.transcriptDirectory,
'--language',
language
]}`
this.measurePerformanceMark()
return new TranscriptFile({
language,
path: join(this.transcriptDirectory, `${baseName}.${format}`),
format
})
}
}

View File

@ -0,0 +1,5 @@
export * from './ctranslate2-transcriber.js'
export * from './transformers-js-transcriber.js'
export * from './transformers-transcriber.js'
export * from './openai-transcriber.js'
export * from './timestamped-transcriber.js'

View File

@ -0,0 +1,41 @@
import { join } from 'path'
import { $ } from 'execa'
import { TranscriptionModel } from '../../transcription-model.js'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { AbstractTranscriber } from '../../abstract-transcriber.js'
import { getFileInfo } from '../../file-utils.js'
export class OpenaiTranscriber extends AbstractTranscriber {
async transcribe (
mediaFilePath: string,
model: TranscriptionModel = { name: 'tiny' },
language: string = 'en',
format: TranscriptFormat = 'vtt'
): Promise<TranscriptFile> {
this.createPerformanceMark()
// Shall we run the command with `{ shell: true }` to get the same error as in sh?
// ex: ENOENT => Command not found
const $$ = $({ verbose: true })
const { baseName } = getFileInfo(mediaFilePath)
await $$`${this.engine.binary} ${[
mediaFilePath,
'--model',
model?.path || model.name,
'--output_format',
format,
'--output_dir',
this.transcriptDirectory,
'--language',
language
]}`
this.measurePerformanceMark()
return new TranscriptFile({
language,
path: join(this.transcriptDirectory, `${baseName}.${format}`),
format
})
}
}

View File

@ -0,0 +1,46 @@
import { $ } from 'execa'
import assert from 'node:assert'
import { join } from 'node:path'
import { existsSync } from 'node:fs'
import { rename } from 'node:fs/promises'
import { TranscriptionModel } from '../../transcription-model.js'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { getFileInfo } from '../../file-utils.js'
import { OpenaiTranscriber } from './openai-transcriber.js'
export class WhisperTimestampedTranscriber extends OpenaiTranscriber {
async transcribe (
mediaFilePath: string,
model: TranscriptionModel,
language: string,
format: TranscriptFormat = 'vtt'
): Promise<TranscriptFile> {
this.createPerformanceMark()
const $$ = $({ verbose: true })
const { baseName, name } = getFileInfo(mediaFilePath)
await $$`${this.engine.binary} ${[
mediaFilePath,
'--model',
model?.path || model.name,
'--output_format',
'all',
'--output_dir',
this.transcriptDirectory
]}`
const internalTranscriptPath = join(this.transcriptDirectory, `${name}.${format}`)
const transcriptPath = join(this.transcriptDirectory, `${baseName}.${format}`)
// whisper-timestamped outputs a file named after the full media file name, extension included (e.g. video.mp4.vtt), so we rename it to the usual <basename>.<format>
assert(existsSync(internalTranscriptPath), `${internalTranscriptPath} file doesn't exist.`)
await rename(internalTranscriptPath, transcriptPath)
this.measurePerformanceMark()
return new TranscriptFile({
language,
path: transcriptPath,
format
})
}
}

View File

@ -0,0 +1,21 @@
import { TranscriptionModel } from '../../transcription-model.js'
import { AbstractTranscriber } from '../../abstract-transcriber.js'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
// Disable local models
// env.allowLocalModels = true
export class TransformersJsTranscriber extends AbstractTranscriber {
async transcribe (
mediaFilePath: string,
model: TranscriptionModel,
language: string,
format: TranscriptFormat = 'vtt'
): Promise<TranscriptFile> {
return Promise.resolve(undefined)
// return pipeline('automatic-speech-recognition', 'no_attentions', {
// // For medium models, we need to load the `no_attentions` revision to avoid running out of memory
// revision: [].includes('/whisper-medium') ? 'no_attentions' : 'main'
// })
}
}

View File

@ -0,0 +1,43 @@
import { TranscriptionModel } from '../../transcription-model.js'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { AbstractTranscriber } from '../../abstract-transcriber.js'
import { $ } from 'execa'
import { join } from 'path'
export class TransformersTranscriber extends AbstractTranscriber {
async transcribe (
mediaFilePath: string,
model: TranscriptionModel,
language: string,
format: TranscriptFormat = 'vtt'
): Promise<TranscriptFile> {
const $$ = $({ verbose: true })
// const ffmpegChildProcess = $$`ffmpeg ${[
// '-i',
// mediaFilePath,
// '-vn', // no video
// '-ar',
// 16000, // set the audio sampling frequency
// '-ac',
// '1', // set the number of audio channels to 1 since Vosk is expecting mono
// '-bufsize',
// 1000, // set a buffer size to provide a steady flow of frames
// '-'
// ]}`
await $$`transformers-cli ${[
'--task',
'automatic-speech-recognition',
'--model',
'openai/whisper-tiny',
'--input',
mediaFilePath
]}`
return new TranscriptFile({
language,
path: join(this.transcriptDirectory, `test.${format}`),
format
})
}
}

View File

@ -0,0 +1,13 @@
{
"extends": "../../tsconfig.base.json",
"compilerOptions": {
"outDir": "./dist",
"rootDir": "src",
"tsBuildInfoFile": "./dist/.tsbuildinfo"
},
"references": [
{ "path": "../models" },
{ "path": "../core-utils" },
{ "path": "../node-utils" }
]
}

View File

@ -0,0 +1,10 @@
{
"extends": "./tsconfig.json",
"compilerOptions": {
"outDir": "../types-generator/dist/peertube-transcription",
"tsBuildInfoFile": "../types-generator/dist/peertube-transcription/.tsbuildinfo",
"stripInternal": true,
"removeComments": false,
"emitDeclarationOnly": true
}
}

View File

@ -14,6 +14,7 @@
{ "path": "../packages/ffmpeg" },
{ "path": "../packages/models" },
{ "path": "../packages/node-utils" },
{ "path": "../packages/transcription" },
{ "path": "../packages/typescript-utils" }
],
"include": [

View File

@ -27,6 +27,7 @@
{ "path": "./packages/models" },
{ "path": "./packages/node-utils" },
{ "path": "./packages/server-commands" },
{ "path": "./packages/transcription" },
{ "path": "./packages/typescript-utils" }
]
}