mirror of https://github.com/Chocobozzz/PeerTube
feat(transcription): groundwork
chore: fiddling around some more
chore: add ctranslate2 and timestamped
chore: add performance markers
chore: refactor test
chore: change workflow name
chore: ensure Python3
chore(duration): convert to chai/mocha syntax
chore(transcription): add individual tests for the other transcribers
chore(transcription): implement formats test of all implementations
  Also compare the result of the other implementations to the reference implementation
chore(transcription): add more test cases with other languages, model sizes and a local model
chore(test): wip ctranslate2 adaptation
chore(transcription): wip transcript file and benchmark
chore(test): clean a bit
chore(test): clean a bit
chore(test): refactor timestamped spec
chore(test): update workflow
chore(test): fix glob expansion with sh
chore(test): extract some hw info
chore(test): fix async tests
chore(benchmark): add model info
feat(transcription): allow use of a local model in timestamped-whisper
feat(transcription): extract run and profiling info in own value object
feat(transcription): extract run concept in own class and run more benchmarks
chore(transcription): simplify run object
  Only a UUID is now needed; also add more benchmark scenarios.
docs(transcription): create own package readme
docs(transcription): add local model usage
docs(transcription): update README
fix(transcription): use fr video for better comparison
chore(transcription): make openai comparison pass
docs(timestamped): clean
chore(transcription): change transcribers transcribe method signature
  Introduce whisper builtin model.
fix(transcription): activate language detection
  Forbid transcript creation without a language. Add `languageDetection` flag to an engine and some assertions.
  Fix an issue in `whisper-ctranslate2`: https://github.com/Softcatala/whisper-ctranslate2/pull/93
chore(transcription): use PeerTube time helpers instead of custom ones
  Update existing time function to output an integer number of seconds and add a ms human-readable time formatter with hints of tests.
chore(transcription): use PeerTube UUID helpers
chore(transcription): enable CER evaluation
  Thanks to this recent fix in Jiwer <3 https://github.com/jitsi/jiwer/issues/873
chore(jiwer): create JiWER package
  I'm not very happy with the TranscriptFileEvaluator constructor... suggestions?
chore(JiWER): add usage in README
docs(jiwer): update JiWER readme
chore(transcription): use FunMOOC video in fixtures
chore(transcription): add proper English video fixture
chore(transcription): use OS tmp directory where relevant
chore(transcription): fix jiwer cli test reference.txt
chore(transcription): move benchmark out of tests
chore(transcription): remove transcription workflow
docs(transcription): add benchmark info
fix(transcription): use ms precision in other transcribers
chore(transcription): simplify most of the tests
chore(transcription): remove slashes when building path with join
chore(transcription): make fromPath method async
chore(transcription): assert path to model is a directory for CTranslate2 transcriber
chore(transcription): ctranslate2 assertion
chore(transcription): ctranslate2 assertion
chore(transcription): add preinstall script for Python dependencies
chore(transcription): add download and unzip utils functions
chore(transcription): add download and unzip utils functions
chore(transcription): download & unzip models fixtures
chore(transcription): zip
chore(transcription): raise download file test timeout
chore(transcription): simplify download file test
chore(transcription): add transcription tests to CI
chore(transcription): raise test preconditions timeout
chore(transcription): run preinstall scripts before running CI
chore(transcription): create dedicated tmp folder for transcriber tests
chore(transcription): raise timeout some more
chore(transcription): raise timeout some more
chore(transcription): raise timeout some more
chore(transcription): raise timeout some more
chore(transcription): raise timeout some more
chore(transcription): raise timeout some more
chore(transcription): raise timeout some more
chore(transcription): raise timeout some more
chore(transcription): use short video for local model test
chore(transcription): raise timeout some more
chore(transcription): raise timeout some more
chore(transcription): raise timeout some more
chore(transcription): setup verbosity based on NODE_ENV value

pull/6527/head
parent b10482e0e0
commit ef14cf4a5c
@@ -39,7 +39,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        test_suite: [ types-package, client, api-1, api-2, api-3, api-4, api-5, cli-plugin, lint, external-plugins ]
+        test_suite: [ types-package, client, api-1, api-2, api-3, api-4, api-5, transcription, cli-plugin, lint, external-plugins ]

     env:
       PGUSER: peertube
@@ -2,6 +2,7 @@
 node_modules
 *npm-debug.log
 yarn-error.log
 *-ci.log
 .yarn

+# Testing
@@ -11,6 +11,7 @@
     { "path": "../../packages/ffmpeg" },
     { "path": "../../packages/models" },
     { "path": "../../packages/node-utils" },
-    { "path": "../../packages/server-commands" }
+    { "path": "../../packages/server-commands" },
+    { "path": "../../packages/transcription" },
   ]
 }
@@ -25,6 +25,7 @@
   ],
   "scripts": {
     "benchmark-server": "tsx --conditions=peertube:tsx ./scripts/benchmark.ts",
+    "benchmark-transcription": "tsx --conditions=peertube:tsx --tsconfig ./packages/transcription/tsconfig.json ./packages/transcription/src/benchmark.ts",
     "build:client": "bash ./scripts/build/client.sh",
     "build:embed": "bash ./scripts/build/embed.sh",
     "build:peertube-cli": "bash ./scripts/build/peertube-cli.sh",
@@ -125,7 +125,7 @@ function secondsToTime (options: {
   else if (minutes >= 1) time += formatNumber(minutes) + minuteSymbol
   else if (format === 'full') time += '00' + minuteSymbol

-  seconds %= 60
+  seconds = Math.round(seconds) % 60
   if (seconds >= 1 && seconds < 10 && format === 'full') time += '0' + seconds + secondsSymbol
   else if (seconds >= 1) time += formatNumber(seconds) + secondsSymbol
   else if (format === 'full') time += '00'

@@ -133,6 +133,14 @@ function secondsToTime (options: {
   return time
 }

+function millisecondsToTime (options: {
+  seconds: number
+  format: 'short' | 'full' | 'locale-string' // default 'short'
+  symbol?: string
+} | number) {
+  return secondsToTime(typeof options === 'number' ? options / 1000 : { ...options, seconds: options.seconds / 1000 })
+}
+
 // ---------------------------------------------------------------------------

 export {

@@ -143,7 +151,8 @@ export {
   isLastMonth,
   isLastWeek,
   timeToInt,
-  secondsToTime
+  secondsToTime,
+  millisecondsToTime
 }

// ---------------------------------------------------------------------------
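
For reference, a minimal usage sketch of the updated helpers; the expected values below come from the unit tests added later in this commit:

```typescript
import { millisecondsToTime, secondsToTime } from '@peertube/peertube-core-utils'

// seconds are now rounded to the nearest integer before formatting
console.log(secondsToTime(61.6))        // '1m2s'

// millisecondsToTime divides by 1000 and delegates to secondsToTime,
// so durations below 500ms round down to an empty string
console.log(millisecondsToTime(60_000)) // '1m'
console.log(millisecondsToTime(499))    // ''
```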
@@ -0,0 +1,37 @@
JiWER
=====

A Node.js wrapper around the __JiWER__ CLI.

> *JiWER is a python tool for computing the word-error-rate of ASR systems.*
> https://jitsi.github.io/jiwer/cli/

__JiWER__ serves as a reference implementation to calculate error rates between two text files:
- WER (Word Error Rate)
- CER (Character Error Rate)
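
Both rates are edit distances normalized by the length of the reference, at word level for WER and at character level for CER:

```latex
\mathrm{WER} = \frac{S + D + I}{N}
```

where S, D and I count the substitutions, deletions and insertions needed to turn the hypothesis into the reference, and N is the number of words (characters, for CER) in the reference.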

Build
-----

```sh
npm run build
```

Usage
-----
```typescript
const jiwerCLI = new JiwerClI('./reference.txt', './hypothesis.txt')

// WER as a percentage, ex: 0.03 -> 3%
console.log(await jiwerCLI.wer())

// CER as a percentage: 0.01 -> 1%
console.log(await jiwerCLI.cer())

// Detailed comparison report
console.log(await jiwerCLI.alignment())
```

Resources
---------
- https://jitsi.github.io/jiwer/
- https://github.com/rapidfuzz/RapidFuzz
@@ -0,0 +1,20 @@
{
  "name": "@peertube/peertube-jiwer",
  "private": true,
  "version": "0.0.0",
  "main": "dist/index.js",
  "files": [ "dist" ],
  "exports": {
    "types": "./dist/index.d.ts",
    "peertube:tsx": "./src/index.ts",
    "default": "./dist/index.js"
  },
  "type": "module",
  "devDependencies": {},
  "scripts": {
    "preinstall": "pip install -r requirements.txt",
    "build": "tsc",
    "watch": "tsc -w"
  },
  "dependencies": {}
}
@@ -0,0 +1 @@
jiwer==3.0.4
@@ -0,0 +1 @@
export * from './jiwer-cli.js'
@@ -0,0 +1,69 @@
import { $ } from 'execa'

export class JiwerClI {
  referenceFilePath: string
  hypothesisFilePath: string

  constructor (referenceFilePath: string, hypothesisFilePath: string) {
    this.referenceFilePath = referenceFilePath
    this.hypothesisFilePath = hypothesisFilePath
  }

  /**
   * @param referenceFilePath Path to new-line delimited text file of reference sentences.
   * @param hypothesisFilePath Path to new-line delimited text file of hypothesis sentences.
   * @param args Extra CLI arguments forwarded to jiwer
   */
  static buildArgs (referenceFilePath: string, hypothesisFilePath: string, ...args: string[]) {
    return [
      '--reference',
      referenceFilePath,
      '--hypothesis',
      hypothesisFilePath,
      ...args
    ]
  }

  buildArgs (...args: string[]) {
    return JiwerClI.buildArgs(this.referenceFilePath, this.hypothesisFilePath, ...args)
  }

  /**
   * WER: Word Error Rate as a percentage, ex: 0.03 -> 3%
   */
  static async wer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
    const { stdout: wer } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, ...(global ? [ '-g' ] : []))}`

    return Number(wer)
  }

  async wer (global = true) {
    return await JiwerClI.wer(this.referenceFilePath, this.hypothesisFilePath, global)
  }

  /**
   * CER: Character Error Rate
   */
  static async cer (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<number> {
    const { stdout: cer } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--cer', ...(global ? [ '-g' ] : []))}`

    return Number(cer)
  }

  async cer (global = true) {
    return await JiwerClI.cer(this.referenceFilePath, this.hypothesisFilePath, global)
  }

  /**
   * Print alignment of each sentence.
   */
  static async alignment (referenceFilePath: string, hypothesisFilePath: string, global = true): Promise<string> {
    const { stdout: alignment } = await $`jiwer ${JiwerClI.buildArgs(referenceFilePath, hypothesisFilePath, '--align', ...(global ? [ '-g' ] : []))}`

    return alignment
  }

  async alignment (global = true) {
    return await JiwerClI.alignment(this.referenceFilePath, this.hypothesisFilePath, global)
  }
}
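
The static API mirrors the instance methods; a quick sketch (the `jiwer` CLI must be on the PATH, installed through this package's `requirements.txt`):

```typescript
import { JiwerClI } from '@peertube/peertube-jiwer'

// word error rate between two newline-delimited text files, as a ratio (0.03 -> 3%)
const wer = await JiwerClI.wer('./reference.txt', './hypothesis.txt')
console.log(`WER: ${(wer * 100).toFixed(2)}%`)
```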
@@ -0,0 +1,8 @@
{
  "extends": "../../tsconfig.base.json",
  "compilerOptions": {
    "outDir": "./dist",
    "rootDir": "src",
    "tsBuildInfoFile": "./dist/.tsbuildinfo"
  }
}
@@ -1,4 +1,4 @@
-import short from 'short-uuid'
+import short, { SUUID } from 'short-uuid'

 const translator = short()

@@ -6,6 +6,10 @@ function buildUUID () {
   return short.uuid()
 }

+function buildSUUID (): SUUID {
+  return short.generate()
+}
+
 function uuidToShort (uuid: string) {
   if (!uuid) return uuid

@@ -26,7 +30,10 @@ function isShortUUID (value: string) {

 export {
   buildUUID,
+  buildSUUID,
   uuidToShort,
   shortToUUID,
   isShortUUID
 }
+
+export type { SUUID }
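
A minimal sketch of the new helper; the value shown is hypothetical, but `short-uuid` generates 22-character flickrBase58 identifiers by default:

```typescript
import { buildSUUID } from '@peertube/peertube-node-utils'

const runId = buildSUUID() // e.g. 'mhvXdrZT4jP5T8vBxuvm75' (hypothetical value)
```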
|
||||
|
|
|
@@ -59,6 +59,15 @@ export function makeRawRequest (options: {
   return makeGetRequest(reqOptions)
 }

+export const makeFileRequest = (url: string) => {
+  return makeRawRequest({
+    url,
+    responseType: 'arraybuffer',
+    redirects: 1,
+    expectedStatus: HttpStatusCode.OK_200
+  })
+}
+
 export function makeGetRequest (options: CommonRequestParams & {
   query?: any
   rawQuery?: string
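
A hedged usage sketch of the new helper, assuming it is re-exported from `@peertube/peertube-server-commands` like its `makeRawRequest` sibling:

```typescript
import { makeFileRequest } from '@peertube/peertube-server-commands'

// fetches the file as a buffer, following at most one redirect and expecting HTTP 200
const res = await makeFileRequest('https://download.cpy.re/peertube/4k_file.txt')
console.log(res.body.length)
```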
Binary file not shown.
@@ -0,0 +1,16 @@
🇫🇷 DRANE Occitanie - Communiquer lors d'une classe transplantée
[./communiquer-lors-dune-classe-transplantee.mp4](communiquer-lors-dune-classe-transplantee.mp4)
> https://podeduc.apps.education.fr/numerique-educatif/video/21893-communiquer-lors-dune-classe-transplantee/
>
> CC BY-NC-SA 4.0 Deed
> Attribution-NonCommercial-ShareAlike 4.0 International

🇫🇷 [Accompagner la victime d'une dérive sectaire ou d'une emprise mentale](https://www.fun-mooc.fr/fr/cours/accompagner-la-victime-de-derive-sectaire/)
> Centre Contre les Manipulations Mentales (CCMM)
> [CC BY-NC-ND 4.0 Deed](https://creativecommons.org/licenses/by-nc-nd/4.0/)
> Attribution-NonCommercial-NoDerivs 4.0 International

🇺🇸 [The Last Man On Earth (1964)](https://archive.org/details/TheLastManOnEarthHD)
> PDM 1.0 Deed
> Public Domain Mark 1.0 Universal
> https://creativecommons.org/publicdomain/mark/1.0/
BIN packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.mp4 (vendored, new file; binary content not shown)
10 packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.txt (vendored, new file)
@@ -0,0 +1,10 @@
Communiquer lors d'une classe transplantée. Utiliser les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
C'est le scénario pédagogique présenté par Monsieur Navoli, professeur en cycle 3 sur une école élémentaire de Montpellier.
La première application utilisée sera la médiathèque. L'enseignant va alors transférer les différentes photos réalisées lors de la classe transplantée.
Dans un dossier spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans l'ENT, dans la médiathèque de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utiliseront le blog, à partir de leurs notes, il pourront, seul ou à 2 par poste rédiger un article dans leur ENT.
Ils illustreront ces articles à l'aide des photos et documents numériques mis en accès libre dans l'ENT.
Pour ce faire, il pourront utiliser l'éditeur avancé qui les renverra directement dans la médiathèque de la classe, où ils pourront retrouver le dossier créé par leur enseignant.
Une fois leur article terminé, les élèves soumettront celui-ci au professeur qui pourra soit l'annoter pour correction ou le publier.
Ensuite, il pourront lire et commenter ceux de leurs camarades, ou répondre aux commentaires de la veille.
Binary file not shown.
@@ -0,0 +1,165 @@

1
00:00:03,640 --> 00:00:05,640
-Bonjour et bienvenue sur FUN MOOC.

2
00:00:05,960 --> 00:00:09,000
Notre MOOC "Comment parler
à une victime d'emprise mentale

3
00:00:09,320 --> 00:00:10,400
ou de dérive sectaire"

4
00:00:10,720 --> 00:00:13,840
s'adresse à tout professionnel
du domaine de la santé,

5
00:00:14,160 --> 00:00:15,920
de l'associatif, du juridique,

6
00:00:16,240 --> 00:00:18,800
qui pourra être en contact
avec une victime de telles dérives.

7
00:00:21,720 --> 00:00:23,840
Il sera composé de 14 leçons vidéo

8
00:00:24,160 --> 00:00:26,040
d'une dizaine de minutes

9
00:00:26,360 --> 00:00:28,600
divisées en quatre blocs.

10
00:00:31,800 --> 00:00:34,960
Le premier bloc vous informera
de ce que sont exactement

11
00:00:35,280 --> 00:00:37,720
l'emprise mentale
et une dérive sectaire.

12
00:00:38,040 --> 00:00:42,440
-Ça consiste toujours
en une forme de manipulation

13
00:00:43,520 --> 00:00:47,320
qui conduit à une dépendance,
à une sorte de cercle vicieux,

14
00:00:47,640 --> 00:00:51,200
où les personnes ne parviennent pas
à se désengager d'un processus

15
00:00:51,520 --> 00:00:54,120
qui les conduit
soit à donner de l'argent,

16
00:00:54,440 --> 00:00:56,160
à se livrer à des actes

17
00:00:56,480 --> 00:00:58,480
qu'en réalité
ils n'auraient pas acceptés,

18
00:00:58,800 --> 00:01:02,160
ou, tout simplement, à accepter
de participer à une organisation

19
00:01:02,480 --> 00:01:03,760
dont ils ne partagent pas

20
00:01:04,080 --> 00:01:06,040
toutes les méthodes
ou tous les points de vue.

21
00:01:06,360 --> 00:01:10,080
-Le deuxième bloc vous informera
des bonnes techniques d'écoute

22
00:01:10,400 --> 00:01:12,680
d'une personne
ayant vécu de tels traumatismes.

23
00:01:13,000 --> 00:01:14,760
-C'est un sujet actuel

24
00:01:15,080 --> 00:01:17,320
parce que ce phénomène
est en croissance.

25
00:01:17,640 --> 00:01:20,000
Il y a une augmentation très importante,
un doublement,

26
00:01:20,320 --> 00:01:21,400
en l'espace de quelques années,

27
00:01:21,720 --> 00:01:22,960
en moins de 10 ans.

28
00:01:27,200 --> 00:01:31,000
-Le bloc 3, lui,
sera conçu par nos juristes

29
00:01:31,320 --> 00:01:34,080
pour vous indiquer
quelles sont les grandes infractions

30
00:01:34,400 --> 00:01:36,960
en lien avec l'emprise mentale,

31
00:01:37,280 --> 00:01:39,120
et surtout, pouvoir faire
une analyse perspicace

32
00:01:39,440 --> 00:01:41,640
d'une situation individuelle.

33
00:01:43,760 --> 00:01:46,960
Enfin, le bloc 4 vous assistera

34
00:01:47,280 --> 00:01:50,320
pour savoir comment aiguiller
une victime

35
00:01:50,640 --> 00:01:52,400
vers les bons professionnels.

36
00:01:53,160 --> 00:01:54,040
Bonne formation.
@@ -0,0 +1,11 @@
-Bonjour et bienvenue sur FUN MOOC.
Notre MOOC "Comment parler à une victime d'emprise mentale ou de dérive sectaire" s'adresse à tout professionnel du domaine de la santé, de l'associatif, du juridique, qui pourra être en contact avec une victime de telles dérives.
Il sera composé de 14 leçons vidéo d'une dizaine de minutes divisées en quatre blocs.
Le premier bloc vous informera de ce que sont exactement l'emprise mentale et une dérive sectaire.
-Ça consiste toujours en une forme de manipulation qui conduit à une dépendance, à une sorte de cercle vicieux, où les personnes ne parviennent pas à se désengager d'un processus qui les conduit soit à donner de l'argent, à se livrer à des actes qu'en réalité ils n'auraient pas acceptés, ou, tout simplement, à accepter de participer à une organisation dont ils ne partagent pas toutes les méthodes ou tous les points de vue.
-Le deuxième bloc vous informera des bonnes techniques d'écoute d'une personne ayant vécu de tels traumatismes.
-C'est un sujet actuel parce que ce phénomène est en croissance.
Il y a une augmentation très importante, un doublement, en l'espace de quelques années, en moins de 10 ans.
-Le bloc 3, lui, sera conçu par nos juristes pour vous indiquer quelles sont les grandes infractions en lien avec l'emprise mentale, et surtout, pouvoir faire une analyse perspicace d'une situation individuelle.
Enfin, le bloc 4 vous assistera pour savoir comment aiguiller une victime vers les bons professionnels.
Bonne formation.
Binary file not shown.
@@ -0,0 +1,17 @@
1
00:00:00,000 --> 00:00:01,940
December, 1965.

2
00:00:03,460 --> 00:00:06,660
Is that all it has been since
I inherited the world?

3
00:00:07,020 --> 00:00:08,900
Only three years.

4
00:00:09,940 --> 00:00:11,760
Seems like a hundred million.
@@ -0,0 +1,5 @@
December, 1965.
Is that all it has been since
I inherited the world?
Only three years.
It seems like a hundred million.
@@ -0,0 +1,14 @@
WEBVTT

00:00.000 --> 00:01.940
December, 1965.

00:03.460 --> 00:06.660
Is that all it has been since I inherited the world?

00:07.020 --> 00:08.900
Only three years.

00:09.940 --> 00:11.760
Seems like a hundred million.
@@ -0,0 +1,29 @@
import { millisecondsToTime, secondsToTime } from '@peertube/peertube-core-utils'
import { expect } from 'chai'

describe('Seconds to time', function () {
  it('Outputs a human readable time', function () {
    expect(secondsToTime(61.1335)).to.equals('1m1s')
  })

  it('Rounds the number of seconds to the nearest integer', function () {
    expect(secondsToTime(61.4)).to.equals('1m1s')
    expect(secondsToTime(61.6)).to.equals('1m2s')
    expect(secondsToTime(61.51)).to.equals('1m2s')
  })
})

describe('Milliseconds to time', function () {
  it('Outputs a human readable time', function () {
    expect(millisecondsToTime(60_000)).to.equals('1m')
  })

  it('Rounds the number of seconds to the nearest integer', function () {
    expect(millisecondsToTime(60_100)).to.equals('1m')
    expect(millisecondsToTime(60_501)).to.equals('1m1s')
  })

  it('Renders durations below 500ms as an empty string', function () {
    expect(millisecondsToTime(499)).to.equals('')
  })
})
@@ -0,0 +1,48 @@
/* eslint-disable max-len */
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import { join } from 'path'
import { mkdir, rm, writeFile } from 'node:fs/promises'
import { expect } from 'chai'
import { JiwerClI } from '@peertube/peertube-jiwer'

describe('Jiwer CLI', function () {
  const transcriptDirectory = buildAbsoluteFixturePath('transcription/transcript-evaluator')
  const referenceTranscriptFilePath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.txt')
  const hypothesis = join(transcriptDirectory, 'openai.txt')
  const jiwerCLI = new JiwerClI(referenceTranscriptFilePath, hypothesis)

  before(async function () {
    await mkdir(transcriptDirectory, { recursive: true })
    await writeFile(join(transcriptDirectory, 'openai.txt'), `Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
C'est le scénario P-Dagujic présenté par monsieur Navoli, professeur ainsi que le 3 sur une école alimentaire de Montpellier.
La première application a utilisé ce ralame déatec. L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplante.
Dans un dossier, spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans le venté, dans la médiatèque de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rédigeant un article d'un reinté.
Ils illustront ses articles à l'aide des photos de que mon numérique mise à n'accélier dans le venté.
Pour se faire, il pourront utiliser les diteurs avancés qui les renvèrent directement dans la médiatèque de la classe où il pourront retrouver le dossier créé par leurs enseignants.
Une fois leur article terminée, les élèves soumétront se lui-ci au professeur qui pourra soit la noté pour correction ou le public.
Ensuite, il pourront lire et commenter ce de leurs camarades ou répondre aux commentaires de la veille.
`)
  })

  it(`returns coherent wer`, async function () {
    const wer = await jiwerCLI.wer()
    expect(wer).to.be.below(30 / 100)
    expect(wer).to.be.greaterThan(0 / 100)
  })

  it(`returns coherent cer`, async function () {
    const cer = await jiwerCLI.cer()
    expect(cer).to.be.below(10 / 100)
    expect(cer).to.be.greaterThan(9 / 100)
  })

  it(`prints alignment`, async function () {
    console.log(await jiwerCLI.alignment())
  })

  after(async function () {
    await rm(transcriptDirectory, { recursive: true, force: true })
  })
})
@@ -29,5 +29,7 @@ export const FIXTURE_URLS = {

   chatersVideo: 'https://download.cpy.re/peertube/video_chapters.mp4',

-  file4K: 'https://download.cpy.re/peertube/4k_file.txt'
+  file4K: 'https://download.cpy.re/peertube/4k_file.txt',
+
+  transcriptionModels: 'https://download.cpy.re/peertube/transcription-models.zip'
 }
@@ -0,0 +1,18 @@
import { expect } from 'chai'
import { levenshteinDistance } from '@peertube/peertube-transcription'

describe('Levenshtein distance', function () {
  it(`equals 1 when there is only one character difference`, function () {
    expect(levenshteinDistance('abcd', 'abce')).equals(1)
  })

  it(`may calculate a distance on a txt subtitle content`, function () {
    expect(levenshteinDistance(`December, 1965.
Is that all it has been since
I inherited the world?
Only three years.
Seems like a hundred million.

`, 'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.')).equals(13)
  })
})
@@ -0,0 +1,33 @@
import { srtToTxt } from '@peertube/peertube-transcription'
import { expect } from 'chai'

describe('srt to txt', function () {
  it(`Transforms the content of a srt subtitle to a pure text version`, function () {
    const txt = srtToTxt(`1
00:00:00,000 --> 00:00:01,940
December, 1965.

2
00:00:03,460 --> 00:00:06,660
Is that all it has been since
I inherited the world?

3
00:00:07,020 --> 00:00:08,900
Only three years.

4
00:00:09,940 --> 00:00:11,760
Seems like a hundred million.

`)

    expect(txt).equals(`December, 1965.
Is that all it has been since
I inherited the world?
Only three years.
Seems like a hundred million.

`)
  })
})
@@ -0,0 +1,17 @@
import { transcriberFactory } from '@peertube/peertube-transcription'

describe('Transcriber factory', function () {
  const transcribers = [
    'openai-whisper',
    'whisper-ctranslate2',
    'whisper-timestamped'
  ]

  describe('Should be able to create a transcriber for each available transcription engine', function () {
    transcribers.forEach(function (transcriberName) {
      it(`Should be able to create a(n) ${transcriberName} transcriber`, function () {
        transcriberFactory.createFromEngineName(transcriberName)
      })
    })
  })
})
@@ -0,0 +1,67 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, no-new, max-len */
import { TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import { join } from 'node:path'
import { mkdir, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { expect } from 'chai'

describe('Transcript File Evaluator', function () {
  const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'transcript-file-evaluator')
  const referenceTranscriptFilePath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.txt')

  before(async function () {
    await mkdir(transcriptDirectory, { recursive: true })
  })

  it(`may not compare files in another format than txt`, async function () {
    const vttReference = await TranscriptFile.write({
      path: join(transcriptDirectory, 'reference.vtt'),
      format: 'vtt',
      content: ''
    })
    const vttHypothesis = await TranscriptFile.write({
      path: join(transcriptDirectory, 'hypothesis.vtt'),
      format: 'vtt',
      content: ''
    })
    expect(() => new TranscriptFileEvaluator(vttReference, vttHypothesis)).to.throw('Can only evaluate txt transcript file')
  })

  it(`evaluation must return coherent wer & cer`, async function () {
    const reference = new TranscriptFile({
      path: referenceTranscriptFilePath,
      language: 'fr',
      format: 'txt'
    })
    const hypothesis = await TranscriptFile.write({
      path: join(transcriptDirectory, 'openai.txt'),
      content: `Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
C'est le scénario P-Dagujic présenté par monsieur Navoli, professeur ainsi que le 3 sur une école alimentaire de Montpellier.
La première application a utilisé ce ralame déatec. L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplante.
Dans un dossier, spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans le venté, dans la médiatèque de la classe.
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
Les élèves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rédigeant un article d'un reinté.
Ils illustront ses articles à l'aide des photos de que mon numérique mise à n'accélier dans le venté.
Pour se faire, il pourront utiliser les diteurs avancés qui les renvèrent directement dans la médiatèque de la classe où il pourront retrouver le dossier créé par leurs enseignants.
Une fois leur article terminée, les élèves soumétront se lui-ci au professeur qui pourra soit la noté pour correction ou le public.
Ensuite, il pourront lire et commenter ce de leurs camarades ou répondre aux commentaires de la veille.
`,
      format: 'txt',
      language: 'fr'
    })
    const evaluator = new TranscriptFileEvaluator(reference, hypothesis)
    const wer = await evaluator.wer()
    expect(wer).to.be.greaterThan(0 / 100)
    expect(wer).to.be.below(30 / 100)

    const cer = await evaluator.cer()
    expect(cer).to.be.greaterThan(9 / 100)
    expect(cer).to.be.below(10 / 100)
    console.log(await evaluator.alignment())
  })

  after(async function () {
    await rm(transcriptDirectory, { recursive: true, force: true })
  })
})
@@ -0,0 +1,44 @@
/* eslint-disable @typescript-eslint/no-unused-expressions */
import { expect } from 'chai'
import { join } from 'node:path'
import { mkdir, rm } from 'node:fs/promises'
import { TranscriptFile } from '@peertube/peertube-transcription'
import { tmpdir } from 'node:os'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'

describe('Transcript File', function () {
  const transcriptFileDirectory = join(tmpdir(), 'peertube-transcription', 'transcript-file')
  before(async function () {
    await mkdir(transcriptFileDirectory, { recursive: true })
  })

  it(`may create a new transcript file from scratch`, async function () {
    const transcript1 = await TranscriptFile.write({
      path: join(transcriptFileDirectory, 'test1.txt'),
      content: 'test2',
      format: 'txt'
    })
    const transcript2 = await TranscriptFile.write({
      path: join(transcriptFileDirectory, 'test2.txt'),
      content: 'test2',
      format: 'txt'
    })

    expect(await transcript1.equals(transcript2)).to.be.true
  })

  it(`may create a txt transcript file object from a transcript without providing the format explicitly`, function () {
    TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.srt'), 'en')
    TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.txt'), 'en')
  })

  it(`fails when loading a file which is obviously not a transcript`, function () {
    expect(() => TranscriptFile.fromPath(buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4'), 'en'))
      .to.throw(`Couldn't guess transcript format from extension "mp4". Valid formats are: txt, vtt, srt.`)
  })

  after(async function () {
    await rm(transcriptFileDirectory, { recursive: true, force: true })
  })
})
@@ -0,0 +1 @@
describe('Transcription run', function () {})
@@ -0,0 +1,44 @@
import { cp, lstat, mkdir, rm } from 'node:fs/promises'
import { join } from 'node:path'
import { tmpdir } from 'node:os'
import { expect } from 'chai'
import { downloadFile, unzip } from '@peertube/peertube-transcription'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'

describe('downloadFile', function () {
  const testDirectory = join(tmpdir(), 'peertube-transcription', 'utils')
  before(async function () {
    await mkdir(testDirectory, { recursive: true })
  })

  it(`Downloads a file and writes it to disk`, async function () {
    const filePath = await downloadFile('https://download.cpy.re/peertube/4k_file.txt', testDirectory)

    expect(await lstat(filePath).then(stats => stats.isFile())).equals(true)
  })

  after(async function () {
    await rm(testDirectory, { recursive: true, force: true })
  })
})

describe('unzip', function () {
  const zipFixtureFileName = 'hello_world.zip'
  const zipFixtureFilePath = buildAbsoluteFixturePath(`transcription/${zipFixtureFileName}`)
  const testDirectory = join(tmpdir(), 'peertube-transcription', 'utils')
  before(async function () {
    await mkdir(testDirectory, { recursive: true })
  })

  it(`Extracts a zip archive to a directory`, async function () {
    const zipFilePath = join(testDirectory, zipFixtureFileName)
    await cp(zipFixtureFilePath, zipFilePath)
    const unzippedDirectory = await unzip(zipFilePath)

    expect(await lstat(unzippedDirectory).then(stats => stats.isDirectory())).equals(true)
  })

  after(async function () {
    await rm(testDirectory, { recursive: true, force: true })
  })
})
@@ -0,0 +1,125 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
import { expect, config } from 'chai'
import { createLogger } from 'winston'
import { join } from 'node:path'
import { mkdir, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import {
  downloadFile,
  levenshteinDistance,
  OpenaiTranscriber,
  TranscriptFile,
  TranscriptFileEvaluator,
  TranscriptionModel,
  unzip,
  WhisperBuiltinModel
} from '@peertube/peertube-transcription'
import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js'

config.truncateThreshold = 0

describe('Open AI Whisper transcriber', function () {
  const tmpDirectory = join(tmpdir(), 'peertube-transcription')
  const transcriptDirectory = join(tmpDirectory, 'transcriber', 'openai')
  const modelsDirectory = join(tmpDirectory, 'models')
  const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4')
  const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
  const referenceTranscriptFile = new TranscriptFile({
    path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
    language: 'fr',
    format: 'txt'
  })
  const transcriber = new OpenaiTranscriber(
    {
      name: 'openai-whisper',
      requirements: [],
      type: 'binary',
      binary: 'whisper',
      supportedModelFormats: [ 'PyTorch' ],
      languageDetection: true
    },
    createLogger(),
    transcriptDirectory
  )

  before(async function () {
    this.timeout(1 * 1000 * 60)
    await mkdir(transcriptDirectory, { recursive: true })
    await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory))
  })

  it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
    this.timeout(3 * 1000 * 60)
    const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' })

    expect(transcript.format).to.equals('vtt')
    expect(transcript.language).to.equals('en')
    expect(await transcript.read()).not.to.be.empty
  })

  it('May produce a transcript file in the `srt` format', async function () {
    const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' })

    expect(transcript.format).to.equals('srt')
    expect(transcript.language).to.equals('en')
    expect(await transcript.read()).not.to.be.empty
  })

  it('May produce a transcript file in the `txt` format', async function () {
    const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' })

    expect(transcript.format).to.equals('txt')
    expect(transcript.language).to.equals('en')
    expect(await transcript.read()).not.to.be.empty
    expect(levenshteinDistance(
      (await transcript.read()).toString(),
      'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.'
    )).to.be.below(3)
  })

  it('May transcribe a media file using a local PyTorch model', async function () {
    this.timeout(2 * 1000 * 60)
    await transcriber.transcribe({
      mediaFilePath: shortVideoPath,
      model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')),
      language: 'en'
    })
  })

  it('May transcribe a media file in french', async function () {
    this.timeout(3 * 1000 * 60)
    const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' })

    expect(transcript.format).to.equals('txt')
    expect(transcript.language).to.equals('fr')
    expect(await transcript.read()).not.to.be.empty
  })

  it('Guesses the video language if not provided', async function () {
    this.timeout(3 * 1000 * 60)
    const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })

    expect(transcript.language).to.equals('fr')
  })

  it('May transcribe a media file in french with small model', async function () {
    this.timeout(6 * 1000 * 60)
    const transcript = await transcriber.transcribe({
      mediaFilePath: frVideoPath,
      language: 'fr',
      format: 'txt',
      model: new WhisperBuiltinModel('small')
    })

    expect(transcript.language).to.equals('fr')

    const transcriptFileEvaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcript)
    const cer = await transcriptFileEvaluator.cer()
    expect(cer).to.be.below(6 / 100)
  })

  after(async function () {
    await rm(transcriptDirectory, { recursive: true, force: true })
  })
})
@@ -0,0 +1,133 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
import { expect, config } from 'chai'
import { createLogger } from 'winston'
import { join } from 'node:path'
import { mkdir, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import {
  OpenaiTranscriber,
  WhisperTimestampedTranscriber,
  TranscriptFileEvaluator,
  TranscriptionModel,
  WhisperTranscribeArgs,
  levenshteinDistance, downloadFile, unzip
} from '@peertube/peertube-transcription'
import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js'

config.truncateThreshold = 0

describe('Linto timestamped Whisper transcriber', function () {
  const tmpDirectory = join(tmpdir(), 'peertube-transcription')
  const transcriptDirectory = join(tmpDirectory, 'transcriber', 'timestamped')
  const modelsDirectory = join(tmpDirectory, 'models')
  const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4')
  const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
  const transcriber = new WhisperTimestampedTranscriber(
    {
      name: 'whisper-timestamped',
      requirements: [],
      type: 'binary',
      binary: 'whisper_timestamped',
      supportedModelFormats: [ 'PyTorch' ],
      languageDetection: true
    },
    createLogger(),
    transcriptDirectory
  )

  before(async function () {
    this.timeout(1 * 1000 * 60)
    await mkdir(transcriptDirectory, { recursive: true })
    await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory))
  })

  it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
    this.timeout(1 * 1000 * 60)
    const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' })

    expect(transcript.format).to.equals('vtt')
    expect(transcript.language).to.equals('en')
    expect(await transcript.read()).not.to.be.empty
  })

  it('May produce a transcript file in the `srt` format with a ms precision', async function () {
    const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' })

    expect(transcript.format).to.equals('srt')
    expect(transcript.language).to.equals('en')
    expect(await transcript.read()).not.to.be.empty
  })

  it('May produce a transcript file in `txt` format', async function () {
    const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' })

    expect(transcript.format).to.equals('txt')
    expect(transcript.language).to.equals('en')
    expect(await transcript.read()).not.to.be.empty
    expect(levenshteinDistance(
      (await transcript.read()).toString(),
      'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.'
    )).to.be.below(10)
  })

  it('May transcribe a media file using a local PyTorch model file', async function () {
    this.timeout(2 * 1000 * 60)
    await transcriber.transcribe({
      mediaFilePath: shortVideoPath,
      model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')),
      language: 'en'
    })
  })

  it('May transcribe a media file in french', async function () {
    this.timeout(2 * 1000 * 60)
    const transcript = await transcriber.transcribe({
      mediaFilePath: frVideoPath,
      language: 'fr',
      format: 'txt'
    })

    expect(transcript.format).to.equals('txt')
    expect(transcript.language).to.equals('fr')
    expect(await transcript.read()).not.to.be.empty
  })

  it('Guesses the video language if not provided', async function () {
    this.timeout(2 * 1000 * 60)
    const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })
    expect(transcript.language).to.equals('fr')
  })

  it('Should produce a text transcript similar to openai-whisper implementation', async function () {
    this.timeout(11 * 1000 * 60)
    const transcribeArgs: WhisperTranscribeArgs = {
      mediaFilePath: frVideoPath,
      model: await TranscriptionModel.fromPath(join(modelsDirectory, 'tiny.pt')),
      language: 'fr',
      format: 'txt'
    }
    const transcript = await transcriber.transcribe(transcribeArgs)

    const openaiTranscriber = new OpenaiTranscriber(
      {
        name: 'openai-whisper',
        requirements: [],
        type: 'binary',
        binary: 'whisper',
        supportedModelFormats: [ 'PyTorch' ]
      },
      createLogger(),
      join(transcriptDirectory, 'openai-whisper')
    )
    const openaiTranscript = await openaiTranscriber.transcribe(transcribeArgs)

    const transcriptFileEvaluator = new TranscriptFileEvaluator(openaiTranscript, transcript)
    expect(await transcriptFileEvaluator.wer()).to.be.below(25 / 100)
    expect(await transcriptFileEvaluator.cer()).to.be.below(15 / 100)
  })

  after(async function () {
    await rm(transcriptDirectory, { recursive: true, force: true })
  })
})
@@ -0,0 +1,137 @@
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
import { expect, config } from 'chai'
import { createLogger } from 'winston'
import { join } from 'node:path'
import { mkdir, rm } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
import {
  Ctranslate2Transcriber, downloadFile,
  levenshteinDistance,
  OpenaiTranscriber,
  TranscriptFile,
  TranscriptFileEvaluator,
  TranscriptionModel, unzip,
  WhisperTranscribeArgs
} from '@peertube/peertube-transcription'
import { FIXTURE_URLS } from '@tests/shared/fixture-urls.js'

config.truncateThreshold = 0

describe('Whisper CTranslate2 transcriber', function () {
  const tmpDirectory = join(tmpdir(), 'peertube-transcription')
  const transcriptDirectory = join(tmpDirectory, 'transcriber', 'ctranslate2')
  const modelsDirectory = join(tmpDirectory, 'models')
  const shortVideoPath = buildAbsoluteFixturePath('transcription/videos/the_last_man_on_earth.mp4')
  const frVideoPath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
  const transcriber = new Ctranslate2Transcriber(
    {
      name: 'anyNameShouldBeFineReally',
      requirements: [],
      type: 'binary',
      binary: 'whisper-ctranslate2',
      supportedModelFormats: [],
      languageDetection: true
    },
    createLogger(),
    transcriptDirectory
  )

  before(async function () {
    this.timeout(1 * 1000 * 60)
    await mkdir(transcriptDirectory, { recursive: true })
    await unzip(await downloadFile(FIXTURE_URLS.transcriptionModels, tmpDirectory))
  })

  it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
    const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en' })

    expect(transcript.format).to.equals('vtt')
    expect(transcript.language).to.equals('en')
    expect(await transcript.read()).not.to.be.empty
  })

  it('May produce a transcript file in the `srt` format', async function () {
    const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'srt' })

    expect(transcript.format).to.equals('srt')
    expect(transcript.language).to.equals('en')
    expect(await transcript.read()).not.to.be.empty
  })

  it('May produce a transcript file in the `txt` format', async function () {
    const transcript = await transcriber.transcribe({ mediaFilePath: shortVideoPath, language: 'en', format: 'txt' })
    expect(await transcript.equals(new TranscriptFile({
      path: join(transcriptDirectory, 'the_last_man_on_earth.txt'),
      format: 'txt',
      language: 'en'
    }))).to.be.true

    expect(transcript.format).to.equals('txt')
    expect(transcript.language).to.equals('en')
    expect(await transcript.read()).not.to.be.empty
    expect(levenshteinDistance(
      (await transcript.read()).toString(),
      'December 1965, is that all it has been since I inherited the world only three years, seems like a hundred million.'
    )).to.be.below(5)
  })

  it('May transcribe a media file using a local CTranslate2 model', async function () {
    this.timeout(2 * 1000 * 60)
    const transcript = await transcriber.transcribe({
      mediaFilePath: shortVideoPath,
      model: await TranscriptionModel.fromPath(join(modelsDirectory, 'faster-whisper-tiny')),
      language: 'en',
      format: 'txt'
    })

    expect(transcript.format).to.equals('txt')
    expect(transcript.language).to.equals('en')
    expect(await transcript.read()).not.to.be.empty
  })

  it('May transcribe a media file in french', async function () {
    this.timeout(5 * 1000 * 60)
    const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath, language: 'fr', format: 'txt' })

    expect(transcript.format).to.equals('txt')
    expect(transcript.language).to.equals('fr')
    expect(await transcript.read()).not.to.be.empty
  })

  it('Guesses the video language if not provided', async function () {
    this.timeout(2 * 1000 * 60)
    const transcript = await transcriber.transcribe({ mediaFilePath: frVideoPath })
    expect(transcript.language).to.equals('fr')
  })

  it('Should produce a text transcript similar to openai-whisper implementation', async function () {
    this.timeout(10 * 1000 * 60)
    const transcribeArgs: WhisperTranscribeArgs = {
      mediaFilePath: frVideoPath,
      language: 'fr',
      format: 'txt'
    }
    const transcript = await transcriber.transcribe(transcribeArgs)
    const openaiTranscriber = new OpenaiTranscriber(
      {
        name: 'openai-whisper',
        requirements: [],
        type: 'binary',
        binary: 'whisper',
        supportedModelFormats: [ 'PyTorch' ]
      },
      createLogger(),
      join(transcriptDirectory, 'openai-whisper')
    )
    const openaiTranscript = await openaiTranscriber.transcribe(transcribeArgs)

    const transcriptFileEvaluator = new TranscriptFileEvaluator(openaiTranscript, transcript)
    expect(await transcriptFileEvaluator.wer()).to.be.below(20 / 100)
    expect(await transcriptFileEvaluator.cer()).to.be.below(10 / 100)
  })

  after(async function () {
    await rm(transcriptDirectory, { recursive: true, force: true })
  })
})
@@ -6,16 +6,20 @@
     "tsBuildInfoFile": "./dist/.tsbuildinfo",
     "paths": {
       "@tests/*": [ "./src/*" ],
-      "@server/*": [ "../../server/core/*" ]
+      "@server/*": [ "../../server/core/*" ],
+      "@peertube/peertube-transcription": [ "../transcription" ],
+      "@peertube/peertube-jiwer": [ "../jiwer" ],
     }
   },
   "references": [
     { "path": "../core-utils" },
     { "path": "../ffmpeg" },
+    { "path": "../jiwer" },
     { "path": "../models" },
     { "path": "../node-utils" },
     { "path": "../typescript-utils" },
     { "path": "../server-commands" },
+    { "path": "../transcription" },
     { "path": "../../server/tsconfig.lib.json" }
   ],
   "include": [
@@ -0,0 +1,99 @@
# Transcription

Video **transcription** consists of transcribing the audio content of a video to text.
> This process may also be called __Automatic Speech Recognition__ or __Speech to Text__ in a more general context.

This package provides a common API to multiple transcription backends, currently:
- `openai-whisper` CLI
- `faster-whisper` (*via* the `whisper-ctranslate2` CLI)
- `whisper-timestamped`

> Potential candidates could be: whisper-cpp, vosk, ...

## Requirements
- Python
- PIP

And at least one of the following transcription backends:
- Python:
  - `openai-whisper`
  - `whisper-ctranslate2>=0.4.3`
  - `whisper-timestamped>=1.15.4`

And to run the transcript evaluation tests:
- Python
- `jiwer>=3.0.4`
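
The pinned backends can be installed in one go with pip; the versions below mirror the `requirements.txt` files added in this commit:

```sh
pip install openai-whisper==20231117 whisper-ctranslate2==0.4.4 whisper-timestamped==1.15.4 jiwer==3.0.4
```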
|
||||
|
||||
## Usage
|
||||
|
||||
Create a transcriber manually :
|
||||
```typescript
|
||||
import { OpenaiTranscriber } from '@peertube/peertube-transcription'
|
||||
|
||||
(async () => {
|
||||
// create a transcriber powered by OpeanAI Whisper CLI
|
||||
const transcriber = new OpenaiTranscriber({
|
||||
name: 'openai-whisper',
|
||||
binary: 'whisper',
|
||||
languageDetection: true
|
||||
});
|
||||
|
||||
const transcriptFile = await transcriber.transcribe({
|
||||
mediaFilePath: './myVideo.mp4',
|
||||
format: 'txt'
|
||||
});
|
||||
|
||||
console.log(transcriptFile.path);
|
||||
console.log(await transcriptFile.read());
|
||||
})();
|
||||
```
|
||||
|
||||
Using a local model file:
|
||||
|
||||
```typescript
|
||||
import { WhisperBuiltinModel } from '@peertube/peertube-transcription/dist'
|
||||
|
||||
const transcriptFile = await transcriber.transcribe({
|
||||
mediaFilePath: './myVideo.mp4',
|
||||
model: WhisperBuiltinModel.fromPath('./models/large.pt'),
|
||||
format: 'txt'
|
||||
});
|
||||
```
|
||||
|
||||
You may use the builtin Factory if you're happy with the default configuration:
|
||||
```Typescript
|
||||
import { transcriberFactory } from '@peertube/peertube-transcription'
|
||||
transcriberFactory.createFromEngineName('openai-whisper')
|
||||
```
|
||||
> For further usage [../tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts](../tests/src/transcription/whisper/transcriber/openai-transcriber.spec.ts)

## Benchmark

A benchmark of the available __transcribers__ can be run with:

```sh
npm run benchmark
```

```
┌────────────────────────┬───────────────────────┬───────────────────────┬──────────┬────────┬───────────────────────┐
│ (index)                │ WER                   │ CER                   │ duration │ model  │ engine                │
├────────────────────────┼───────────────────────┼───────────────────────┼──────────┼────────┼───────────────────────┤
│ 5yZGBYqojXe7nuhq1TuHvz │ '28.39506172839506%'  │ '9.62457337883959%'   │ '41s'    │ 'tiny' │ 'openai-whisper'      │
│ x6qREJ2AkTU4e5YmvfivQN │ '29.75206611570248%'  │ '10.46195652173913%'  │ '15s'    │ 'tiny' │ 'whisper-ctranslate2' │
│ qbt6BekKMVzxq4KCSLCzt3 │ '31.020408163265305%' │ '10.784982935153584%' │ '20s'    │ 'tiny' │ 'whisper-timestamped' │
└────────────────────────┴───────────────────────┴───────────────────────┴──────────┴────────┴───────────────────────┘
```

The benchmark may be run with multiple builtin model sizes:

```sh
MODELS=tiny,small,large npm run benchmark
```

## Lexicon

- ONNX: Open Neural Network eXchange. A model interchange specification; the ONNX Runtime runs these models.
- GPTs: Generative Pre-Trained Transformers
- LLM: Large Language Model
- NLP: Natural Language Processing
- MLP: Multilayer Perceptron
- ASR: Automatic Speech Recognition
- WER: Word Error Rate
- CER: Character Error Rate (sketched below)
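
Both error rates are edit distances normalized by the reference length: WER counts word-level substitutions, deletions and insertions over the number of reference words, while CER does the same at the character level. The evaluation itself is delegated to jiwer, but the character-level case can be sketched with the Levenshtein helper shipped in this package:

```typescript
import { levenshteinDistance } from '@peertube/peertube-transcription'

// CER = edit distance / number of reference characters.
const reference = 'the quick brown fox'
const hypothesis = 'the quick brown box' // one substituted character

const cer = levenshteinDistance(reference, hypothesis) / reference.length
console.log(`CER: ${(cer * 100).toFixed(2)}%`) // 1 / 19 ≈ 5.26%
```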
@ -0,0 +1,21 @@
{
  "name": "@peertube/peertube-transcription",
  "private": true,
  "version": "0.0.0",
  "main": "dist/index.js",
  "files": [ "dist" ],
  "exports": {
    "types": "./dist/index.d.ts",
    "peertube:tsx": "./src/index.ts",
    "default": "./dist/index.js"
  },
  "type": "module",
  "devDependencies": {},
  "scripts": {
    "preinstall": "pip install -r requirements.txt",
    "build": "tsc",
    "watch": "tsc -w",
    "benchmark": "tsx --conditions=peertube:tsx --tsconfig ./tsconfig.json ./src/benchmark.ts"
  },
  "dependencies": {}
}
@ -0,0 +1,3 @@
openai-whisper==20231117
whisper-ctranslate2==0.4.4
whisper-timestamped==1.15.4
@ -0,0 +1,69 @@
import { createLogger, Logger } from 'winston'
import { join } from 'node:path'
import { PerformanceObserver } from 'node:perf_hooks'
import { buildSUUID, SUUID, root } from '@peertube/peertube-node-utils'
import { TranscriptionEngine } from './transcription-engine.js'
import { TranscriptionModel } from './transcription-model.js'
import { TranscriptionRun } from './transcription-run.js'
import { TranscriptFile, TranscriptFormat } from './transcript/index.js'

export interface TranscribeArgs {
  mediaFilePath: string
  model: TranscriptionModel
  language?: string
  format?: TranscriptFormat
  runId?: SUUID
}

export abstract class AbstractTranscriber {
  public static DEFAULT_TRANSCRIPT_DIRECTORY = join(root(), 'dist', 'transcripts')

  engine: TranscriptionEngine
  logger: Logger
  transcriptDirectory: string
  performanceObserver?: PerformanceObserver
  run?: TranscriptionRun

  constructor (
    engine: TranscriptionEngine,
    logger: Logger = createLogger(),
    transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY,
    performanceObserver?: PerformanceObserver
  ) {
    this.engine = engine
    this.logger = logger
    this.transcriptDirectory = transcriptDirectory
    this.performanceObserver = performanceObserver
  }

  createRun (uuid: SUUID = buildSUUID()) {
    this.run = new TranscriptionRun(this.logger, uuid)
  }

  startRun () {
    this.run.start()
  }

  stopRun () {
    this.run.stop()
    delete this.run
  }

  assertLanguageDetectionAvailable (language?: string) {
    if (!this.engine.languageDetection && !language) {
      throw new Error(`Language detection isn't available in ${this.engine.name}. A language must be provided explicitly.`)
    }
  }

  supports (model: TranscriptionModel) {
    return model.format === 'PyTorch'
  }

  abstract transcribe ({
    mediaFilePath,
    model,
    language,
    format = 'vtt',
    runId = buildSUUID()
  }: TranscribeArgs): Promise<TranscriptFile>
}
@ -0,0 +1,139 @@
import { createLogger, transports, format } from 'winston'
import { join } from 'node:path'
import { performance, PerformanceObserver } from 'node:perf_hooks'
import { tmpdir } from 'node:os'
import { rm, mkdir } from 'node:fs/promises'
import { buildAbsoluteFixturePath, buildSUUID, SUUID } from '@peertube/peertube-node-utils'
import {
  transcriberFactory,
  TranscriptFile,
  TranscriptFileEvaluator,
  TranscriptionEngine,
  TranscriptionModel
} from '@peertube/peertube-transcription'
import { millisecondsToTime } from '@peertube/peertube-core-utils'

interface BenchmarkResult {
  uuid: SUUID
  WER?: number
  CER?: number
  duration?: number
  engine?: TranscriptionEngine
  model?: string
}

type Benchmark = Record<SUUID, BenchmarkResult>

const benchmarkReducer = (benchmark: Benchmark = {}, benchmarkResult: BenchmarkResult) => ({
  ...benchmark,
  [benchmarkResult.uuid]: {
    ...benchmark[benchmarkResult.uuid],
    ...benchmarkResult
  }
})

const groupBenchmarkResultsByModel = (benchmarkResults: Record<string, BenchmarkResult>) => (benchmarksGroupedByModel, uuid) => ({
  ...benchmarksGroupedByModel,
  [benchmarkResults[uuid].model]: {
    ...benchmarksGroupedByModel[benchmarkResults[uuid].model],
    [uuid]: formatBenchmarkResult(benchmarkResults[uuid])
  }
})

interface FormattedBenchmarkResult {
  WER?: string
  CER?: string
  duration?: string
  model?: string
  engine?: string
}

const formatBenchmarkResult = ({ WER, CER, duration, engine, model }: Partial<BenchmarkResult>): FormattedBenchmarkResult => ({
  WER: WER ? `${WER * 100}%` : undefined,
  CER: CER ? `${CER * 100}%` : undefined,
  duration: duration ? millisecondsToTime(duration) : undefined,
  model,
  engine: engine.name
})

void (async () => {
  const logger = createLogger()
  logger.add(new transports.Console({ format: format.printf(log => log.message) }))

  const transcribers = [
    'openai-whisper',
    'whisper-ctranslate2',
    'whisper-timestamped'
  ]
  const models = process.env.MODELS
    ? process.env.MODELS.trim().split(',').map(modelName => modelName.trim()).filter(modelName => modelName)
    : [ 'tiny' ]

  const transcriptDirectory = join(tmpdir(), 'peertube-transcription', 'benchmark')
  const mediaFilePath = buildAbsoluteFixturePath('transcription/videos/derive_sectaire.mp4')
  const referenceTranscriptFile = new TranscriptFile({
    path: buildAbsoluteFixturePath('transcription/videos/derive_sectaire.txt'),
    language: 'fr',
    format: 'txt'
  })

  let benchmarkResults: Record<string, BenchmarkResult> = {}

  // before
  await mkdir(transcriptDirectory, { recursive: true })
  const performanceObserver = new PerformanceObserver((items) => {
    items
      .getEntries()
      .forEach((entry) => {
        benchmarkResults = benchmarkReducer(benchmarkResults, {
          uuid: entry.name as SUUID,
          duration: entry.duration
        })
      })
  })
  performanceObserver.observe({ type: 'measure' })

  // benchmark
  logger.info(`Running transcribers benchmark with the following models: ${models.join(', ')}`)
  for (const transcriberName of transcribers) {
    logger.info(`Create "${transcriberName}" transcriber for the benchmark...`)

    const transcriber = transcriberFactory.createFromEngineName(
      transcriberName,
      createLogger(),
      transcriptDirectory
    )

    for (const modelName of models) {
      logger.info(`Run benchmark with "${modelName}" model:`)
      const model = new TranscriptionModel(modelName)
      const uuid = buildSUUID()
      const transcriptFile = await transcriber.transcribe({
        mediaFilePath,
        model,
        language: 'fr',
        format: 'txt',
        runId: uuid
      })
      const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)
      // let the PerformanceObserver callback flush the duration measure
      await new Promise(resolve => setTimeout(resolve, 1))

      benchmarkResults = benchmarkReducer(benchmarkResults, {
        uuid,
        engine: transcriber.engine,
        WER: await evaluator.wer(),
        CER: await evaluator.cer(),
        model: model.name
      })
    }
  }

  // display
  const benchmarkResultsGroupedByModel = Object
    .keys(benchmarkResults)
    .reduce(groupBenchmarkResultsByModel(benchmarkResults), {})
  Object.values(benchmarkResultsGroupedByModel).forEach(benchmark => console.table(benchmark))

  // after
  await rm(transcriptDirectory, { recursive: true, force: true })
  performance.clearMarks()
})()
@ -0,0 +1,13 @@
import { TranscriberFactory } from './transcriber-factory.js'
import { engines } from './whisper/index.js'

export * from './transcript/index.js'
export * from './levenshtein.js'
export * from './subtitle.js'
export * from './transcription-engine.js'
export * from './transcription-model.js'
export * from './transcription-run.js'
export * from './utils.js'
export * from './whisper/index.js'

export const transcriberFactory = new TranscriberFactory(engines)
@ -0,0 +1,101 @@
function min (d0: number, d1: number, d2: number, bx: number, ay: number) {
  return d0 < d1 || d2 < d1
    ? d0 > d2
      ? d2 + 1
      : d0 + 1
    : bx === ay
      ? d1
      : d1 + 1
}

/**
 * @see https://github.com/gustf/js-levenshtein
 */
export function levenshteinDistance (a: string, b: string): number {
  if (a === b) {
    return 0
  }

  if (a.length > b.length) {
    const tmp = a
    a = b
    b = tmp
  }

  let la = a.length
  let lb = b.length

  // trim the common suffix
  while (la > 0 && (a.charCodeAt(la - 1) === b.charCodeAt(lb - 1))) {
    la--
    lb--
  }

  let offset = 0

  // trim the common prefix
  while (offset < la && (a.charCodeAt(offset) === b.charCodeAt(offset))) {
    offset++
  }

  la -= offset
  lb -= offset

  if (la === 0 || lb < 3) {
    return lb
  }

  let x = 0
  let y: number
  let d0: number
  let d1: number
  let d2: number
  let d3: number
  let dd: number
  let dy: number
  let ay: number
  let bx0: number
  let bx1: number
  let bx2: number
  let bx3: number

  const vector: number[] = []

  for (y = 0; y < la; y++) {
    vector.push(y + 1)
    vector.push(a.charCodeAt(offset + y))
  }

  const len = vector.length - 1

  // process the distance matrix four columns at a time
  for (; x < lb - 3;) {
    bx0 = b.charCodeAt(offset + (d0 = x))
    bx1 = b.charCodeAt(offset + (d1 = x + 1))
    bx2 = b.charCodeAt(offset + (d2 = x + 2))
    bx3 = b.charCodeAt(offset + (d3 = x + 3))
    dd = (x += 4)
    for (y = 0; y < len; y += 2) {
      dy = vector[y]
      ay = vector[y + 1]
      d0 = min(dy, d0, d1, bx0, ay)
      d1 = min(d0, d1, d2, bx1, ay)
      d2 = min(d1, d2, d3, bx2, ay)
      dd = min(d2, d3, dd, bx3, ay)
      vector[y] = dd
      d3 = d2
      d2 = d1
      d1 = d0
      d0 = dy
    }
  }

  // remaining columns
  for (; x < lb;) {
    bx0 = b.charCodeAt(offset + (d0 = x))
    dd = ++x
    for (y = 0; y < len; y += 2) {
      dy = vector[y]
      vector[y] = dd = min(dy, d0, dd, bx0, vector[y + 1])
      d0 = dy
    }
  }

  return dd
}
@ -0,0 +1 @@
// Strip SRT cue numbers and timecode lines, keeping only the subtitle text
export const srtToTxt = (srtContent: string) => srtContent.replace(/^\n*\d+\n\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/gm, '')
@ -0,0 +1,49 @@
import { Logger, createLogger } from 'winston'
import { TranscriptionEngine } from './transcription-engine.js'
import {
  Ctranslate2Transcriber,
  OpenaiTranscriber,
  WhisperTimestampedTranscriber
} from './whisper/index.js'
import { AbstractTranscriber } from './abstract-transcriber.js'

export class TranscriberFactory {
  engines: TranscriptionEngine[]

  constructor (engines: TranscriptionEngine[]) {
    this.engines = engines
  }

  createFromEngineName (
    engineName: string,
    logger: Logger = createLogger(),
    transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY
  ) {
    const engine = this.getEngineByName(engineName)

    const transcriberArgs: ConstructorParameters<typeof AbstractTranscriber> = [
      engine,
      logger,
      transcriptDirectory
    ]

    switch (engineName) {
      case 'openai-whisper':
        return new OpenaiTranscriber(...transcriberArgs)
      case 'whisper-ctranslate2':
        return new Ctranslate2Transcriber(...transcriberArgs)
      case 'whisper-timestamped':
        return new WhisperTimestampedTranscriber(...transcriberArgs)
      default:
        throw new Error(`Unimplemented engine ${engineName}`)
    }
  }

  getEngineByName (engineName: string) {
    const engine = this.engines.find(({ name }) => name === engineName)
    if (!engine) {
      throw new Error(`Unknown engine ${engineName}`)
    }

    return engine
  }
}
@ -0,0 +1,3 @@
export * from './transcript-file.js'
export * from './transcript-file-evaluator.js'
export * from './transcript-file-interface.js'
@ -0,0 +1,12 @@
export interface TranscriptFileEvaluation {
  wer: number
  cer: number
  alignment: string
}

export interface TranscriptFileEvaluatorInterface {
  wer(): Promise<number>
  cer(): Promise<number>
  alignment(): Promise<string>
  evaluate(): Promise<TranscriptFileEvaluation>
}
@ -0,0 +1,46 @@
import assert from 'node:assert'
import { JiwerClI } from '@peertube/peertube-jiwer'
import { TranscriptFileEvaluatorInterface } from './transcript-file-evaluator-interface.js'
import { TranscriptFileInterface } from './index.js'

export class TranscriptFileEvaluator implements TranscriptFileEvaluatorInterface {
  referenceTranscriptFile: TranscriptFileInterface
  hypothesisTranscriptFile: TranscriptFileInterface
  jiwerCLI: JiwerClI

  constructor (referenceTranscriptFile: TranscriptFileInterface, hypothesisTranscriptFile: TranscriptFileInterface) {
    assert(referenceTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')
    assert(hypothesisTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')

    this.referenceTranscriptFile = referenceTranscriptFile
    this.hypothesisTranscriptFile = hypothesisTranscriptFile

    this.jiwerCLI = new JiwerClI(this.referenceTranscriptFile.path, this.hypothesisTranscriptFile.path)
  }

  /**
   * WER: Word Error Rate
   */
  wer () {
    return this.jiwerCLI.wer()
  }

  /**
   * CER: Character Error Rate
   */
  cer () {
    return this.jiwerCLI.cer()
  }

  alignment () {
    return this.jiwerCLI.alignment()
  }

  async evaluate () {
    return {
      wer: await this.wer(),
      cer: await this.cer(),
      alignment: await this.alignment()
    }
  }
}
@ -0,0 +1,3 @@
export type TranscriptFormat = 'txt' | 'vtt' | 'srt' | 'json'

export type TranscriptFileInterface = { path: string, language?: string, format: TranscriptFormat }
@ -0,0 +1,88 @@
import { statSync } from 'node:fs'
import { readFile, writeFile } from 'node:fs/promises'
import { extname } from 'node:path'
import assert from 'node:assert'
import { TranscriptFileInterface, TranscriptFormat } from './transcript-file-interface.js'
import { TranscriptFileEvaluator } from './transcript-file-evaluator.js'
import { srtToTxt } from '../subtitle.js'
import { levenshteinDistance } from '../levenshtein.js'

export class TranscriptFile implements TranscriptFileInterface {
  path: string
  language: string
  format: TranscriptFormat = 'vtt'

  constructor ({ path, language, format = 'vtt' }: { path: string, language: string, format?: TranscriptFormat }) {
    // throws if the file doesn't exist
    statSync(path)

    this.path = path
    this.language = language
    this.format = format
  }

  /**
   * Asynchronously reads the entire contents of a transcript file.
   * @see https://nodejs.org/docs/latest-v18.x/api/fs.html#filehandlereadfileoptions for options
   */
  async read (options: Parameters<typeof readFile>[1] = 'utf8') {
    return await readFile(this.path, options)
  }

  static fromPath (path: string, language = 'en') {
    const format = extname(path).substring(1)

    const guessableFormats = [ 'txt', 'vtt', 'srt' ]
    assert(
      guessableFormats.includes(format),
      `Couldn't guess transcript format from extension "${format}". Valid formats are: ${guessableFormats.join(', ')}.`
    )

    return new TranscriptFile({ path, language, format: format as TranscriptFormat })
  }

  /**
   * Write a transcript file to disk.
   */
  static async write ({
    path,
    content,
    language = 'en',
    format = 'vtt'
  }: { path: string, content: string, language?: string, format?: TranscriptFormat }): Promise<TranscriptFile> {
    await writeFile(path, content)

    return new TranscriptFile({ path, language, format })
  }

  async equals (transcript: TranscriptFile, caseSensitive: boolean = true) {
    if (this.language !== transcript.language) {
      return false
    }

    const content = await this.read()
    const transcriptContent = await transcript.read()

    if (!caseSensitive) {
      return String(content).toLowerCase() === String(transcriptContent).toLowerCase()
    }

    return content === transcriptContent
  }

  cer (transcript: TranscriptFile) {
    return (new TranscriptFileEvaluator(this, transcript)).cer()
  }

  async evaluate (transcript: TranscriptFile) {
    const evaluator = new TranscriptFileEvaluator(this, transcript)

    return evaluator.evaluate()
  }

  async readAsTxt () {
    return srtToTxt(String(await this.read()))
  }

  async distance (transcript: TranscriptFile) {
    return levenshteinDistance(await this.readAsTxt(), await transcript.readAsTxt())
  }
}
@ -0,0 +1,23 @@
import { ModelFormat } from './transcription-model.js'

/**
 * The engine, or framework.
 */
export class TranscriptionEngine {
  name: string
  description?: string
  language?: string
  requirements: string[]
  type: 'binary' | 'bindings' | 'ws'
  binary: string
  license?: string
  forgeURL?: string
  supportedModelFormats: ModelFormat[]
  languageDetection?: true
  // There could be a list of default models

  constructor (parameters: TranscriptionEngine) {
    Object.assign(this, parameters)
  }
}
@ -0,0 +1,34 @@
import assert from 'node:assert'
import { stat } from 'node:fs/promises'
import { parse } from 'node:path'

export type ModelFormat = 'PyTorch' | 'GGML' | 'ONNX' | 'CTranslate2' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark

export class TranscriptionModel {
  name: string
  format?: ModelFormat
  path?: string

  // # - hparams
  // # - Number of dimensions (int)
  // # - Name length (int)
  // # - Dimensions (int[n_dims])
  // # - Name (char[name_length])
  // # - Data (float[n_dims])

  // # - mel filters
  // # - tokenizer vocab
  // # - model variables

  constructor (name: string, path?: string, format?: ModelFormat) {
    this.name = name
    this.path = path
    this.format = format
  }

  static async fromPath (path: string) {
    // stat() throws if the path doesn't exist
    assert(await stat(path), `${path} doesn't exist.`)

    return new TranscriptionModel(parse(path).name, path)
  }
}
@ -0,0 +1,41 @@
import { buildSUUID, SUUID } from '@peertube/peertube-node-utils'
import { createLogger, Logger } from 'winston'

export class TranscriptionRun {
  uuid: SUUID
  logger: Logger

  constructor (logger = createLogger(), uuid: SUUID = buildSUUID()) {
    this.uuid = uuid
    this.logger = logger
  }

  get runId () {
    return this.uuid
  }

  start () {
    performance.mark(this.getStartPerformanceMarkName())
  }

  stop () {
    try {
      performance.mark(this.getEndPerformanceMarkName())
      performance.measure(
        this.runId,
        this.getStartPerformanceMarkName(),
        this.getEndPerformanceMarkName()
      )
    } catch (e) {
      this.logger.log({ level: 'error', message: e })
    }
  }

  getStartPerformanceMarkName () {
    return `${this.runId}-started`
  }

  getEndPerformanceMarkName () {
    return `${this.runId}-ended`
  }
}
@ -0,0 +1,32 @@
import { join, parse } from 'node:path'
import { createWriteStream } from 'node:fs'
import { lstat, unlink } from 'node:fs/promises'
import assert from 'node:assert'
import { $ } from 'execa'
import { makeFileRequest } from '@peertube/peertube-server-commands'

export const downloadFile = async (url: string, targetDirectory: string) => {
  const { base } = parse(url)
  const filePath = join(targetDirectory, base)

  const fileStream = createWriteStream(filePath)
  const stream = makeFileRequest(url).pipe(fileStream)

  return await new Promise((resolve: (filePath: string) => void, reject) => {
    stream.on('finish', () => resolve(filePath))
    stream.on('error', async e => {
      fileStream.close()
      await unlink(filePath)
      reject(e.message)
    })
  })
}

export const unzip = async (zipFilePath: string) => {
  assert(await lstat(zipFilePath).then(stats => stats.isFile()), `${zipFilePath} isn't a file.`)
  const { dir, name } = parse(zipFilePath)

  await $`unzip -o ${zipFilePath} -d ${dir}`

  return join(dir, name)
}
@ -0,0 +1,51 @@
import { TranscriptionEngine } from '../transcription-engine.js'

export const engines: TranscriptionEngine[] = [
  {
    name: 'whisper-cpp',
    description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
    type: 'binary',
    binary: 'main',
    language: 'cpp',
    requirements: [],
    forgeURL: 'https://github.com/ggerganov/whisper.cpp',
    license: 'MIT',
    supportedModelFormats: [ 'ONNX' ]
  },
  {
    name: 'openai-whisper',
    description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
    requirements: [ 'python', 'pyTorch', 'ffmpeg' ],
    language: 'python',
    type: 'binary',
    binary: 'whisper',
    forgeURL: 'https://github.com/openai/whisper',
    license: 'MIT',
    supportedModelFormats: [ 'PyTorch' ],
    languageDetection: true
  },
  {
    name: 'whisper-ctranslate2',
    description: '',
    requirements: [ 'python' ],
    language: 'python',
    type: 'binary',
    binary: 'whisper-ctranslate2',
    forgeURL: 'https://github.com/Softcatala/whisper-ctranslate2',
    license: 'MIT',
    supportedModelFormats: [ 'CTranslate2' ],
    languageDetection: true
  },
  {
    name: 'whisper-timestamped',
    description: '',
    requirements: [ 'python' ],
    language: 'python',
    type: 'binary',
    binary: 'whisper_timestamped',
    forgeURL: 'https://github.com/linto-ai/whisper-timestamped',
    license: 'MIT',
    supportedModelFormats: [ 'CTranslate2' ],
    languageDetection: true
  }
]
@ -0,0 +1,3 @@
export * from './transcriber/index.js'
export * from './engines.js'
export * from './whisper-builtin-model.js'
@ -0,0 +1,49 @@
import { $ } from 'execa'
import { buildSUUID } from '@peertube/peertube-node-utils'
import { lstat } from 'node:fs/promises'
import assert from 'node:assert'
import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js'
import { TranscriptFile } from '../../transcript/index.js'
import { WhisperBuiltinModel } from '../whisper-builtin-model.js'

export class Ctranslate2Transcriber extends OpenaiTranscriber {
  async transcribe ({
    mediaFilePath,
    model = new WhisperBuiltinModel('tiny'),
    language,
    format = 'vtt',
    runId = buildSUUID()
  }: WhisperTranscribeArgs): Promise<TranscriptFile> {
    this.assertLanguageDetectionAvailable(language)

    const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })

    if (model.path) {
      assert(await lstat(model.path).then(stats => stats.isDirectory()), 'Model path must be a path to a directory.')
    }

    // CTranslate2 models are directories, not single files
    const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]
    const languageArgs = language ? [ '--language', language ] : []

    this.createRun(runId)
    this.startRun()
    await $$`${this.engine.binary} ${[
      mediaFilePath,
      ...modelArgs,
      '--word_timestamps',
      'True',
      '--output_format',
      'all',
      '--output_dir',
      this.transcriptDirectory,
      ...languageArgs
    ]}`
    this.stopRun()

    return new TranscriptFile({
      language: language || await this.getDetectedLanguage(mediaFilePath),
      path: this.getTranscriptFilePath(mediaFilePath, format),
      format
    })
  }
}
@ -0,0 +1,3 @@
export * from './ctranslate2-transcriber.js'
export * from './openai-transcriber.js'
export * from './timestamped-transcriber.js'
@ -0,0 +1,62 @@
import { $ } from 'execa'
import { join, parse } from 'node:path'
import { readFile } from 'node:fs/promises'
import { buildSUUID } from '@peertube/peertube-node-utils'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { AbstractTranscriber, TranscribeArgs } from '../../abstract-transcriber.js'
import { WhisperBuiltinModel } from '../whisper-builtin-model.js'
import { TranscriptionModel } from '../../transcription-model.js'

export type WhisperTranscribeArgs = Omit<TranscribeArgs, 'model'> & { model?: TranscriptionModel }

export class OpenaiTranscriber extends AbstractTranscriber {
  async transcribe ({
    mediaFilePath,
    model = new WhisperBuiltinModel('tiny'),
    language,
    format = 'vtt',
    runId = buildSUUID()
  }: WhisperTranscribeArgs): Promise<TranscriptFile> {
    this.assertLanguageDetectionAvailable(language)

    const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
    const languageArgs = language ? [ '--language', language ] : []

    this.createRun(runId)
    this.startRun()
    await $$`${this.engine.binary} ${[
      mediaFilePath,
      '--word_timestamps',
      'True',
      '--model',
      model?.path || model.name,
      '--output_format',
      'all',
      '--output_dir',
      this.transcriptDirectory,
      ...languageArgs
    ]}`
    this.stopRun()

    return new TranscriptFile({
      language: language || await this.getDetectedLanguage(mediaFilePath),
      path: this.getTranscriptFilePath(mediaFilePath, format),
      format
    })
  }

  async getDetectedLanguage (mediaFilePath: string) {
    const { language } = await this.readJsonTranscriptFile(mediaFilePath)

    return language
  }

  async readJsonTranscriptFile (mediaFilePath: string) {
    return JSON.parse(await readFile(this.getTranscriptFilePath(mediaFilePath, 'json'), 'utf8'))
  }

  getTranscriptFilePath (mediaFilePath: string, format: TranscriptFormat) {
    return join(this.transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
  }
}
@ -0,0 +1,55 @@
import { $ } from 'execa'
import { buildSUUID } from '@peertube/peertube-node-utils'
import assert from 'node:assert'
import { join, parse } from 'node:path'
import { existsSync } from 'node:fs'
import { rename } from 'node:fs/promises'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { OpenaiTranscriber, WhisperTranscribeArgs } from './openai-transcriber.js'
import { WhisperBuiltinModel } from '../whisper-builtin-model.js'

export class WhisperTimestampedTranscriber extends OpenaiTranscriber {
  async transcribe ({
    mediaFilePath,
    model = new WhisperBuiltinModel('tiny'),
    language,
    format = 'vtt',
    runId = buildSUUID()
  }: WhisperTranscribeArgs): Promise<TranscriptFile> {
    this.assertLanguageDetectionAvailable(language)

    const $$ = $({ verbose: process.env.NODE_ENV !== 'production' })
    const languageArgs = language ? [ '--language', language ] : []

    this.createRun(runId)
    this.startRun()
    await $$`${this.engine.binary} ${[
      mediaFilePath,
      '--model',
      model?.path || model.name,
      '--output_format',
      'all',
      '--output_dir',
      this.transcriptDirectory,
      ...languageArgs
    ]}`
    this.stopRun()

    const internalTranscriptPath = this.getTranscriptFilePath(mediaFilePath, format, false)
    const transcriptPath = join(this.transcriptDirectory, `${parse(mediaFilePath).name}.${format}`)
    // Whisper timestamped outputs files with the video file extension by default, ex: video.mp4.vtt
    // @see https://github.com/linto-ai/whisper-timestamped/issues/189
    assert(existsSync(internalTranscriptPath), `${internalTranscriptPath} file doesn't exist.`)
    await rename(internalTranscriptPath, transcriptPath)

    // ex: communiquer-lors-dune-classe-transplantee.mp4.words.json
    return new TranscriptFile({
      language: language || await this.getDetectedLanguage(mediaFilePath),
      path: transcriptPath,
      format
    })
  }

  getTranscriptFilePath (mediaFilePath: string, format: TranscriptFormat, words = true) {
    return join(this.transcriptDirectory, `${parse(mediaFilePath).base}${words ? '.words' : ''}.${format}`)
  }
}
@ -0,0 +1,11 @@
import { TranscriptionModel } from '../transcription-model.js'

export type WhisperBuiltinModelName = 'tiny' | 'base' | 'small' | 'medium' | 'large' | 'large-v2' | 'large-v3'

export class WhisperBuiltinModel extends TranscriptionModel {
  // eslint-disable-next-line @typescript-eslint/no-useless-constructor
  constructor (name: WhisperBuiltinModelName) {
    super(name)
  }
}
@ -0,0 +1,15 @@
{
  "extends": "../../tsconfig.base.json",
  "compilerOptions": {
    "outDir": "./dist",
    "rootDir": "src",
    "tsBuildInfoFile": "./dist/.tsbuildinfo"
  },
  "references": [
    { "path": "../models" },
    { "path": "../core-utils" },
    { "path": "../node-utils" },
    { "path": "../jiwer" },
    { "path": "../server-commands" }
  ]
}
@ -0,0 +1,10 @@
{
  "extends": "./tsconfig.json",
  "compilerOptions": {
    "outDir": "../types-generator/dist/peertube-transcription",
    "tsBuildInfoFile": "../types-generator/dist/peertube-transcription/.tsbuildinfo",
    "stripInternal": true,
    "removeComments": false,
    "emitDeclarationOnly": true
  }
}
@ -146,4 +146,13 @@ elif [ "$1" = "lint" ]; then
  npm run swagger-cli -- validate support/doc/api/openapi.yaml

  ( cd client && npm run lint )
elif [ "$1" = "transcription" ]; then
  npm run preinstall --workspace=@peertube/peertube-transcription --workspace=@peertube/peertube-jiwer
  npm run build:server
  npm run build:tests

  transcriptionFiles=$(findTestFiles ./packages/tests/dist/transcription)
  jiwerFiles=$(findTestFiles ./packages/tests/dist/jiwer)

  MOCHA_PARALLEL=true runJSTest "$1" $((3*$speedFactor)) $transcriptionFiles $jiwerFiles
fi
@ -14,6 +14,7 @@
    { "path": "../packages/ffmpeg" },
    { "path": "../packages/models" },
    { "path": "../packages/node-utils" },
    { "path": "../packages/transcription" },
    { "path": "../packages/typescript-utils" }
  ],
  "include": [
@ -24,9 +24,11 @@
    { "path": "./apps/peertube-cli" },
    { "path": "./packages/core-utils" },
    { "path": "./packages/ffmpeg" },
    { "path": "./packages/jiwer" },
    { "path": "./packages/models" },
    { "path": "./packages/node-utils" },
    { "path": "./packages/server-commands" },
    { "path": "./packages/transcription" },
    { "path": "./packages/typescript-utils" }
  ]
}