mirror of https://github.com/Chocobozzz/PeerTube
Merge 154eba502f
into 46b45dc51d
commit
ef6a3da32f
|
@ -0,0 +1 @@
|
|||
packages/tests/fixtures/transcription/models/**/* linguist-generated=true
|
|
@ -0,0 +1,36 @@
|
|||
name: Transcription
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- transcription-backend-workbench
|
||||
|
||||
jobs:
|
||||
|
||||
test:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
- uses: './.github/actions/reusable-prepare-peertube-build'
|
||||
with:
|
||||
node-version: '18.x'
|
||||
|
||||
- uses: './.github/actions/reusable-prepare-peertube-run'
|
||||
|
||||
- name: Install Python libraries
|
||||
run: |
|
||||
pip3 install openai-whisper
|
||||
pip3 install whisper-ctranslate2
|
||||
pip3 install whisper-timestamped
|
||||
pip3 install jiwer
|
||||
|
||||
- name: Run transcription tests
|
||||
run: |
|
||||
npm run mocha -- --exit --bail "packages/tests/src/transcription/**/*.spec.ts"
|
||||
cat /proc/cpuinfo
|
||||
lscpu
|
||||
dmidecode --type processor
|
||||
lshw -C CPU
|
||||
hwinfo --cpu
|
|
@ -0,0 +1,6 @@
|
|||
🇫🇷 DRANE Occitanie - Communiquer lors d'une classe transplantée
|
||||
[./communiquer-lors-dune-classe-transplantee.mp4](videos/communiquer-lors-dune-classe-transplantee.mp4)
|
||||
> https://podeduc.apps.education.fr/numerique-educatif/video/21893-communiquer-lors-dune-classe-transplantee/
|
||||
>
|
||||
> CC BY-NC-SA 4.0 Deed
|
||||
> Attribution-NonCommercial-ShareAlike 4.0 International
|
223
packages/tests/fixtures/transcription/models/faster-whisper-tiny/config.json
generated
vendored
Normal file
223
packages/tests/fixtures/transcription/models/faster-whisper-tiny/config.json
generated
vendored
Normal file
|
@ -0,0 +1,223 @@
|
|||
{
|
||||
"alignment_heads": [
|
||||
[
|
||||
2,
|
||||
2
|
||||
],
|
||||
[
|
||||
3,
|
||||
0
|
||||
],
|
||||
[
|
||||
3,
|
||||
2
|
||||
],
|
||||
[
|
||||
3,
|
||||
3
|
||||
],
|
||||
[
|
||||
3,
|
||||
4
|
||||
],
|
||||
[
|
||||
3,
|
||||
5
|
||||
]
|
||||
],
|
||||
"lang_ids": [
|
||||
50259,
|
||||
50260,
|
||||
50261,
|
||||
50262,
|
||||
50263,
|
||||
50264,
|
||||
50265,
|
||||
50266,
|
||||
50267,
|
||||
50268,
|
||||
50269,
|
||||
50270,
|
||||
50271,
|
||||
50272,
|
||||
50273,
|
||||
50274,
|
||||
50275,
|
||||
50276,
|
||||
50277,
|
||||
50278,
|
||||
50279,
|
||||
50280,
|
||||
50281,
|
||||
50282,
|
||||
50283,
|
||||
50284,
|
||||
50285,
|
||||
50286,
|
||||
50287,
|
||||
50288,
|
||||
50289,
|
||||
50290,
|
||||
50291,
|
||||
50292,
|
||||
50293,
|
||||
50294,
|
||||
50295,
|
||||
50296,
|
||||
50297,
|
||||
50298,
|
||||
50299,
|
||||
50300,
|
||||
50301,
|
||||
50302,
|
||||
50303,
|
||||
50304,
|
||||
50305,
|
||||
50306,
|
||||
50307,
|
||||
50308,
|
||||
50309,
|
||||
50310,
|
||||
50311,
|
||||
50312,
|
||||
50313,
|
||||
50314,
|
||||
50315,
|
||||
50316,
|
||||
50317,
|
||||
50318,
|
||||
50319,
|
||||
50320,
|
||||
50321,
|
||||
50322,
|
||||
50323,
|
||||
50324,
|
||||
50325,
|
||||
50326,
|
||||
50327,
|
||||
50328,
|
||||
50329,
|
||||
50330,
|
||||
50331,
|
||||
50332,
|
||||
50333,
|
||||
50334,
|
||||
50335,
|
||||
50336,
|
||||
50337,
|
||||
50338,
|
||||
50339,
|
||||
50340,
|
||||
50341,
|
||||
50342,
|
||||
50343,
|
||||
50344,
|
||||
50345,
|
||||
50346,
|
||||
50347,
|
||||
50348,
|
||||
50349,
|
||||
50350,
|
||||
50351,
|
||||
50352,
|
||||
50353,
|
||||
50354,
|
||||
50355,
|
||||
50356,
|
||||
50357
|
||||
],
|
||||
"suppress_ids": [
|
||||
1,
|
||||
2,
|
||||
7,
|
||||
8,
|
||||
9,
|
||||
10,
|
||||
14,
|
||||
25,
|
||||
26,
|
||||
27,
|
||||
28,
|
||||
29,
|
||||
31,
|
||||
58,
|
||||
59,
|
||||
60,
|
||||
61,
|
||||
62,
|
||||
63,
|
||||
90,
|
||||
91,
|
||||
92,
|
||||
93,
|
||||
359,
|
||||
503,
|
||||
522,
|
||||
542,
|
||||
873,
|
||||
893,
|
||||
902,
|
||||
918,
|
||||
922,
|
||||
931,
|
||||
1350,
|
||||
1853,
|
||||
1982,
|
||||
2460,
|
||||
2627,
|
||||
3246,
|
||||
3253,
|
||||
3268,
|
||||
3536,
|
||||
3846,
|
||||
3961,
|
||||
4183,
|
||||
4667,
|
||||
6585,
|
||||
6647,
|
||||
7273,
|
||||
9061,
|
||||
9383,
|
||||
10428,
|
||||
10929,
|
||||
11938,
|
||||
12033,
|
||||
12331,
|
||||
12562,
|
||||
13793,
|
||||
14157,
|
||||
14635,
|
||||
15265,
|
||||
15618,
|
||||
16553,
|
||||
16604,
|
||||
18362,
|
||||
18956,
|
||||
20075,
|
||||
21675,
|
||||
22520,
|
||||
26130,
|
||||
26161,
|
||||
26435,
|
||||
28279,
|
||||
29464,
|
||||
31650,
|
||||
32302,
|
||||
32470,
|
||||
36865,
|
||||
42863,
|
||||
47425,
|
||||
49870,
|
||||
50254,
|
||||
50258,
|
||||
50358,
|
||||
50359,
|
||||
50360,
|
||||
50361,
|
||||
50362
|
||||
],
|
||||
"suppress_ids_begin": [
|
||||
220,
|
||||
50257
|
||||
]
|
||||
}
|
BIN
packages/tests/fixtures/transcription/models/faster-whisper-tiny/model.bin
generated
vendored
Normal file
BIN
packages/tests/fixtures/transcription/models/faster-whisper-tiny/model.bin
generated
vendored
Normal file
Binary file not shown.
114853
packages/tests/fixtures/transcription/models/faster-whisper-tiny/tokenizer.json
generated
vendored
Normal file
114853
packages/tests/fixtures/transcription/models/faster-whisper-tiny/tokenizer.json
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
51867
packages/tests/fixtures/transcription/models/faster-whisper-tiny/vocabulary.json
generated
vendored
Normal file
51867
packages/tests/fixtures/transcription/models/faster-whisper-tiny/vocabulary.json
generated
vendored
Normal file
File diff suppressed because it is too large
Load Diff
Binary file not shown.
|
@ -0,0 +1,10 @@
|
|||
Communiquer lors d'une classe transplantée. Utiliser les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
|
||||
C'est le scénario pédagogique présenté par Monsieur Navoli, professeur en cycle 3 sur une école élémentaire de Montpellier.
|
||||
La première application utilisée sera la médiathèque. L'enseignant va alors transférer les différentes photos réalisées lors de la classe transplantée.
|
||||
Dans un dossier spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans l'ENT, dans la médiathèque de la classe.
|
||||
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
|
||||
Les élèves par la suite utiliseront le blog, à partir de leurs notes, il pourront, seul ou à 2 par poste rédiger un article dans leur ENT.
|
||||
Ils illustreront ces articles à l'aide des photos et documents numériques mis en accès libre dans l'ENT.
|
||||
Pour ce faire, il pourront utiliser l'éditeur avancé qui les renverra directement dans la médiathèque de la classe, où ils pourront retrouver le dossier créé par leur enseignant.
|
||||
Une fois leur article terminé, les élèves soumettront celui-ci au professeur qui pourra soit l'annoter pour correction ou le publier.
|
||||
Ensuite, il pourront lire et commenter ceux de leurs camarades, ou répondre aux commentaires de la veille.
|
BIN
packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.mp4
vendored
Normal file
BIN
packages/tests/fixtures/transcription/videos/communiquer-lors-dune-classe-transplantee.mp4
vendored
Normal file
Binary file not shown.
|
@ -0,0 +1,125 @@
|
|||
import { createLogger } from 'winston'
|
||||
import { performance, PerformanceObserver } from 'node:perf_hooks'
|
||||
// import { CpuInfo, CpuUsage } from 'node:os'
|
||||
import { rm, mkdir } from 'node:fs/promises'
|
||||
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
|
||||
import {
|
||||
toHumanReadable,
|
||||
transcriberFactory,
|
||||
TranscriptFile,
|
||||
TranscriptFileEvaluator,
|
||||
TranscriptionEngine
|
||||
} from '@peertube/peertube-transcription'
|
||||
|
||||
const WER_TOLERANCE = 0.01
|
||||
const CER_TOLERANCE = 0.001
|
||||
|
||||
interface TestResult {
|
||||
uuid: string
|
||||
WER: number
|
||||
CER: number
|
||||
duration: number
|
||||
engine: TranscriptionEngine
|
||||
model: string
|
||||
// dataThroughput: number // relevant ?
|
||||
// cpus: CpuInfo[] // https://nodejs.org/docs/latest-v18.x/api/os.html#oscpus
|
||||
// cpuUsages: CpuUsage[] // https://nodejs.org/docs/latest-v18.x/api/process.html#processcpuusagepreviousvalue
|
||||
// // os.totalmem()
|
||||
// // os.freemem()
|
||||
// memoryUsages: Record<number, MemoryUsage> // https://nodejs.org/docs/latest-v18.x/api/process.html#processmemoryusage
|
||||
}
|
||||
|
||||
const benchmarkReducer = (benchmark: Record<string, Partial<TestResult>> = {}, engineName: string, testResult: Partial<TestResult>) => ({
|
||||
...benchmark,
|
||||
[engineName]: {
|
||||
...benchmark[engineName],
|
||||
...testResult
|
||||
}
|
||||
})
|
||||
|
||||
interface FormattedTestResult {
|
||||
WER?: string
|
||||
CER?: string
|
||||
duration?: string
|
||||
model?: string
|
||||
}
|
||||
|
||||
const formatTestResult = ({ WER, CER, duration, model }: Partial<TestResult>): FormattedTestResult => ({
|
||||
WER: WER ? `${WER * 100}%` : undefined,
|
||||
CER: CER ? `${CER * 100}%` : undefined,
|
||||
duration: duration ? toHumanReadable(duration) : undefined,
|
||||
model
|
||||
})
|
||||
|
||||
describe('Transcribers benchmark', function () {
|
||||
const transcribers = [
|
||||
'openai-whisper',
|
||||
'whisper-ctranslate2',
|
||||
'whisper-timestamped'
|
||||
]
|
||||
|
||||
const transcriptDirectory = buildAbsoluteFixturePath('transcription/benchmark/')
|
||||
const mediaFilePath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.mp4')
|
||||
const referenceTranscriptFile = new TranscriptFile({
|
||||
path: buildAbsoluteFixturePath('transcription/transcript/reference.txt'),
|
||||
language: 'fr',
|
||||
format: 'txt'
|
||||
})
|
||||
|
||||
let benchmark: Record<string, Partial<TestResult>> = {}
|
||||
|
||||
before(async function () {
|
||||
await mkdir(transcriptDirectory, { recursive: true })
|
||||
|
||||
const performanceObserver = new PerformanceObserver((items) => {
|
||||
items
|
||||
.getEntries()
|
||||
.forEach((entry) => {
|
||||
const engineName = transcribers.find(transcriberName => entry.name.includes(transcriberName))
|
||||
|
||||
benchmark = benchmarkReducer(benchmark, engineName, {
|
||||
uuid: entry.name,
|
||||
duration: entry.duration
|
||||
})
|
||||
})
|
||||
})
|
||||
performanceObserver.observe({ type: 'measure' })
|
||||
})
|
||||
|
||||
transcribers.forEach(function (transcriberName) {
|
||||
it(`Run ${transcriberName} transcriber benchmark without issue`, async function () {
|
||||
this.timeout(45000)
|
||||
const transcriber = transcriberFactory.createFromEngineName(
|
||||
transcriberName,
|
||||
createLogger(),
|
||||
transcriptDirectory
|
||||
)
|
||||
const model = { name: 'tiny' }
|
||||
const transcriptFile = await transcriber.transcribe(mediaFilePath, model, 'fr', 'txt')
|
||||
const evaluator = new TranscriptFileEvaluator(referenceTranscriptFile, transcriptFile)
|
||||
await new Promise(resolve => setTimeout(resolve, 1))
|
||||
|
||||
benchmark = benchmarkReducer(benchmark, transcriberName, {
|
||||
engine: transcriber.engine,
|
||||
WER: await evaluator.wer(),
|
||||
CER: await evaluator.cer(),
|
||||
model: model.name
|
||||
})
|
||||
})
|
||||
})
|
||||
|
||||
after(async function () {
|
||||
console.table(
|
||||
Object
|
||||
.keys(benchmark)
|
||||
.reduce((formattedBenchmark, engineName, currentIndex, array) => ({
|
||||
...formattedBenchmark,
|
||||
[engineName]: formatTestResult(benchmark[engineName])
|
||||
}), {})
|
||||
)
|
||||
|
||||
await rm(transcriptDirectory, { recursive: true, force: true })
|
||||
|
||||
performance.clearMarks()
|
||||
})
|
||||
})
|
|
@ -0,0 +1,17 @@
|
|||
import { transcriberFactory } from '@peertube/peertube-transcription'
|
||||
|
||||
describe('Transcriber factory', function () {
|
||||
const transcribers = [
|
||||
'openai-whisper',
|
||||
'whisper-ctranslate2',
|
||||
'whisper-timestamped'
|
||||
]
|
||||
|
||||
describe('Should be able to create a transcriber for each available transcription engine', function () {
|
||||
transcribers.forEach(function (transcriberName) {
|
||||
it(`Should be able to create a(n) ${transcriberName} transcriber`, function () {
|
||||
transcriberFactory.createFromEngineName(transcriberName)
|
||||
})
|
||||
})
|
||||
})
|
||||
})
|
|
@ -0,0 +1,66 @@
|
|||
/* eslint-disable @typescript-eslint/no-unused-expressions, no-new, max-len */
|
||||
import { TranscriptFile, TranscriptFileEvaluator } from '@peertube/peertube-transcription'
|
||||
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
|
||||
import { join } from 'path'
|
||||
import { mkdir, rm } from 'node:fs/promises'
|
||||
import { expect } from 'chai'
|
||||
|
||||
describe('Transcript File Evaluator', function () {
|
||||
const transcriptDirectory = buildAbsoluteFixturePath('transcription/transcript-evaluator')
|
||||
const referenceTranscriptFilepath = buildAbsoluteFixturePath('transcription/transcript/reference.txt')
|
||||
|
||||
before(async function () {
|
||||
await mkdir(transcriptDirectory, { recursive: true })
|
||||
})
|
||||
|
||||
it(`may not compare files in another format than txt`, async function () {
|
||||
const vttReference = await TranscriptFile.write({
|
||||
path: join(transcriptDirectory, 'reference.vtt'),
|
||||
format: 'vtt',
|
||||
content: ''
|
||||
})
|
||||
const vttHypothesis = await TranscriptFile.write({
|
||||
path: join(transcriptDirectory, 'hypothesis.vtt'),
|
||||
format: 'vtt',
|
||||
content: ''
|
||||
})
|
||||
expect(() => new TranscriptFileEvaluator(vttReference, vttHypothesis)).to.throw('Can only evaluate txt transcript file')
|
||||
})
|
||||
|
||||
it(`evaluation must return coherent wer & cer`, async function () {
|
||||
const reference = new TranscriptFile({
|
||||
path: referenceTranscriptFilepath,
|
||||
language: 'fr',
|
||||
format: 'txt'
|
||||
})
|
||||
const hypothesis = await TranscriptFile.write({
|
||||
path: join(transcriptDirectory, 'openai.txt'),
|
||||
content: `Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
|
||||
C'est le scénario P-Dagujic présenté par monsieur Navoli, professeur ainsi que le 3 sur une école alimentaire de Montpellier.
|
||||
La première application a utilisé ce ralame déatec. L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplante.
|
||||
Dans un dossier, spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans le venté, dans la médiatèque de la classe.
|
||||
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
|
||||
Les élèves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rédigeant un article d'un reinté.
|
||||
Ils illustront ses articles à l'aide des photos de que mon numérique mise à n'accélier dans le venté.
|
||||
Pour se faire, il pourront utiliser les diteurs avancés qui les renvèrent directement dans la médiatèque de la classe où il pourront retrouver le dossier créé par leurs enseignants.
|
||||
Une fois leur article terminée, les élèves soumétront se lui-ci au professeur qui pourra soit la noté pour correction ou le public.
|
||||
Ensuite, il pourront lire et commenter ce de leurs camarades ou répondre aux commentaires de la veille.
|
||||
`,
|
||||
format: 'txt',
|
||||
language: 'fr'
|
||||
})
|
||||
const evaluator = new TranscriptFileEvaluator(reference, hypothesis)
|
||||
const wer = await evaluator.wer()
|
||||
expect(wer).to.be.below(1)
|
||||
expect(wer).to.be.greaterThan(0.3)
|
||||
|
||||
const cer = await evaluator.cer()
|
||||
expect(cer).to.be.below(0.1)
|
||||
expect(cer).to.be.greaterThan(0.09)
|
||||
console.log(await evaluator.alignement())
|
||||
})
|
||||
|
||||
after(async function () {
|
||||
await rm(transcriptDirectory, { recursive: true, force: true })
|
||||
})
|
||||
})
|
|
@ -0,0 +1,26 @@
|
|||
/* eslint-disable @typescript-eslint/no-unused-expressions */
|
||||
import { expect } from 'chai'
|
||||
import { mkdir } from 'node:fs/promises'
|
||||
import { TranscriptFile } from '@peertube/peertube-transcription'
|
||||
import { buildAbsoluteFixturePath } from '@peertube/peertube-node-utils'
|
||||
|
||||
describe('Transcript File', function () {
|
||||
before(async function () {
|
||||
await mkdir(buildAbsoluteFixturePath('transcription/transcript/'), { recursive: true })
|
||||
})
|
||||
|
||||
it(`may creates a new transcript file from scratch`, async function () {
|
||||
const transcript1 = await TranscriptFile.write({
|
||||
path: buildAbsoluteFixturePath('transcription/transcript/test1.txt'),
|
||||
content: 'test2',
|
||||
format: 'txt'
|
||||
})
|
||||
const transcript2 = await TranscriptFile.write({
|
||||
path: buildAbsoluteFixturePath('transcription/transcript/test2.txt'),
|
||||
content: 'test2',
|
||||
format: 'txt'
|
||||
})
|
||||
|
||||
expect(await transcript1.equals(transcript2)).to.be.true
|
||||
})
|
||||
})
|
|
@ -0,0 +1,140 @@
|
|||
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
|
||||
import { expect, config } from 'chai'
|
||||
import { createLogger } from 'winston'
|
||||
import { join } from 'path'
|
||||
import { mkdir, rm } from 'node:fs/promises'
|
||||
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
|
||||
import { OpenaiTranscriber, TranscriptFile } from '@peertube/peertube-transcription'
|
||||
|
||||
config.truncateThreshold = 0
|
||||
|
||||
describe('Open AI Whisper transcriber', function () {
|
||||
const transcriptDirectory = join(root(), 'test-transcript')
|
||||
const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
|
||||
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.mp4')
|
||||
|
||||
const transcriber = new OpenaiTranscriber(
|
||||
{
|
||||
name: 'openai-whisper',
|
||||
requirements: [],
|
||||
type: 'binary',
|
||||
binary: 'whisper',
|
||||
supportedModelFormats: [ 'PyTorch' ]
|
||||
},
|
||||
createLogger(),
|
||||
transcriptDirectory
|
||||
)
|
||||
|
||||
before(async function () {
|
||||
await mkdir(transcriptDirectory, { recursive: true })
|
||||
})
|
||||
|
||||
it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
|
||||
const transcript = await transcriber.transcribe(shortVideoPath)
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'video_short.vtt'),
|
||||
language: 'en',
|
||||
format: 'vtt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equals(
|
||||
`WEBVTT
|
||||
|
||||
00:00.000 --> 00:02.000
|
||||
You
|
||||
|
||||
`
|
||||
)
|
||||
})
|
||||
|
||||
it('May produce a transcript file in the `srt` format', async function () {
|
||||
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt')
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'video_short.srt'),
|
||||
language: 'en',
|
||||
format: 'srt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equal(
|
||||
`1
|
||||
00:00:00,000 --> 00:00:02,000
|
||||
You
|
||||
|
||||
`
|
||||
)
|
||||
})
|
||||
|
||||
it('May produce a transcript file in the `txt` format', async function () {
|
||||
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt')
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'video_short.txt'),
|
||||
language: 'en',
|
||||
format: 'txt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equal(`You
|
||||
`)
|
||||
})
|
||||
|
||||
it('May transcribe a media file using a local PyTorch model', async function () {
|
||||
await transcriber.transcribe(frVideoPath, { name: 'myLocalModel', path: buildAbsoluteFixturePath('transcription/models/tiny.pt') }, 'fr')
|
||||
})
|
||||
|
||||
it('May transcribe a media file in french', async function () {
|
||||
this.timeout(45000)
|
||||
const transcript = await transcriber.transcribe(frVideoPath, { name: 'tiny' }, 'fr', 'txt')
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'),
|
||||
language: 'fr',
|
||||
format: 'txt'
|
||||
})))
|
||||
|
||||
expect(await transcript.read()).to.equal(
|
||||
`Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
|
||||
C'est le scénario P-Dagujic présenté par monsieur Navoli, professeur ainsi que le 3 sur une école alimentaire de Montpellier.
|
||||
La première application a utilisé ce ralame déatec. L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplante.
|
||||
Dans un dossier, spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans le venté, dans la médiatèque de la classe.
|
||||
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
|
||||
Les élèves par la suite utilisera le blog. A partir de leurs nantes, il pourront se loi de parposte rédigeant un article d'un reinté.
|
||||
Ils illustront ses articles à l'aide des photos de que mon numérique mise à n'accélier dans le venté.
|
||||
Pour se faire, il pourront utiliser les diteurs avancés qui les renvèrent directement dans la médiatèque de la classe où il pourront retrouver le dossier créé par leurs enseignants.
|
||||
Une fois leur article terminée, les élèves soumétront se lui-ci au professeur qui pourra soit la noté pour correction ou le public.
|
||||
Ensuite, il pourront lire et commenter ce de leurs camarades ou répondre aux commentaires de la veille.
|
||||
`
|
||||
)
|
||||
})
|
||||
|
||||
it('May transcribe a media file in french with small model', async function () {
|
||||
this.timeout(400000)
|
||||
const transcript = await transcriber.transcribe(frVideoPath, { name: 'small' }, 'fr', 'txt')
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'),
|
||||
language: 'fr',
|
||||
format: 'txt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equal(
|
||||
`Communiquer lors d'une classe transplantée. Utiliser les photos prises lors de cette classe
|
||||
pour raconter quotidiennement le séjour vécu. C'est le scénario pédagogique présenté
|
||||
par M. Navoli, professeur en cycle 3 sur une école élémentaire de Montpellier.
|
||||
La première application à utiliser sera la médiathèque. L'enseignant va alors transférer
|
||||
les différentes photos réalisées lors de la classe transplantée dans un dossier spécifique
|
||||
pour que les élèves puissent le retrouver plus facilement. Ils téléversent donc ces
|
||||
photos dans le dossier, dans le NT, dans la médiathèque de la classe. Pour terminer,
|
||||
ils s'assurent que le dossier soit bien ouvert aux utilisateurs afin que tout le monde
|
||||
puisse l'utiliser. Les élèves, par la suite, utiliseront le blog. A partir de leur note,
|
||||
ils pourront, seul ou à deux par postes, rédiger un article dans leur NT. Ils illustreront
|
||||
ces articles à l'aide des photos et documents numériques mis en accès libre dans le NT.
|
||||
Pour ce faire, ils pourront utiliser l'éditeur avancé qui les renverra directement dans
|
||||
la médiathèque de la classe où ils pourront retrouver le dossier créé par leur enseignant.
|
||||
Une fois leur article terminé, les élèves soulèteront celui-ci au professeur qui pourra
|
||||
soit la noter pour correction ou le publier. Ensuite, ils pourront lire et commenter ceux
|
||||
de leur camarade, ou répondre au commentaire de la veille.
|
||||
`
|
||||
)
|
||||
})
|
||||
|
||||
after(async function () {
|
||||
await rm(transcriptDirectory, { recursive: true, force: true })
|
||||
})
|
||||
})
|
|
@ -0,0 +1,148 @@
|
|||
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
|
||||
import { expect, config } from 'chai'
|
||||
import { createLogger } from 'winston'
|
||||
import { join } from 'path'
|
||||
import { mkdir, rm } from 'node:fs/promises'
|
||||
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
|
||||
import { OpenaiTranscriber, WhisperTimestampedTranscriber, TranscriptFile } from '@peertube/peertube-transcription'
|
||||
|
||||
config.truncateThreshold = 0
|
||||
|
||||
describe('Linto timestamped Whisper transcriber', function () {
|
||||
const transcriptDirectory = join(root(), 'test-transcript')
|
||||
const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
|
||||
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.mp4')
|
||||
const transcriber = new WhisperTimestampedTranscriber(
|
||||
{
|
||||
name: 'whisper-timestamped',
|
||||
requirements: [],
|
||||
type: 'binary',
|
||||
binary: 'whisper_timestamped',
|
||||
supportedModelFormats: [ 'PyTorch' ]
|
||||
},
|
||||
createLogger(),
|
||||
transcriptDirectory
|
||||
)
|
||||
|
||||
before(async function () {
|
||||
await mkdir(transcriptDirectory, { recursive: true })
|
||||
})
|
||||
|
||||
it('Should transcribe a media file and produce a transcript file in `vtt` with a ms precision', async function () {
|
||||
const transcript = await transcriber.transcribe(
|
||||
shortVideoPath,
|
||||
{ name: 'tiny' },
|
||||
'fr'
|
||||
)
|
||||
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'video_short.vtt'),
|
||||
language: 'fr',
|
||||
format: 'vtt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equals(
|
||||
`WEBVTT
|
||||
|
||||
00:02.480 --> 00:02.500
|
||||
you
|
||||
|
||||
`
|
||||
)
|
||||
})
|
||||
|
||||
it('May produce a transcript file in the `srt` format with a ms precision', async function () {
|
||||
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt')
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'video_short.srt'),
|
||||
language: 'en',
|
||||
format: 'srt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equals(
|
||||
`1
|
||||
00:00:02,480 --> 00:00:02,500
|
||||
you
|
||||
|
||||
`
|
||||
)
|
||||
})
|
||||
|
||||
it('May produce a transcript file in `txt` format', async function () {
|
||||
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt')
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'video_short.txt'),
|
||||
language: 'en',
|
||||
format: 'txt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equals(`you
|
||||
`)
|
||||
})
|
||||
|
||||
it('May transcribe a media file using a local PyTorch model file', async function () {
|
||||
await transcriber.transcribe(frVideoPath, { name: 'myLocalModel', path: buildAbsoluteFixturePath('transcription/models/tiny.pt') }, 'fr')
|
||||
})
|
||||
|
||||
it('May transcribe a media file in french', async function () {
|
||||
this.timeout(45000)
|
||||
const transcript = await transcriber.transcribe(frVideoPath, { name: 'tiny' }, 'fr', 'txt')
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'),
|
||||
language: 'fr',
|
||||
format: 'txt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equal(
|
||||
`...
|
||||
Communiquez lors du ne class et transplanté.
|
||||
Utilisez les photos prises lors de cette classe pour raconter quotidiennement le seuil jour vécu.
|
||||
C'est le scénario P.D. à Goujit présenté par M.I.N.A.Voli,
|
||||
professeur en cycle 3 sur une école émenteur de Montpellier.
|
||||
La première application a utilisé ce ralame de Yatek.
|
||||
L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplantée dans un dossier,
|
||||
spécifique pour que les élèves puissent le retrouver plus facilement.
|
||||
Il t'éleverce donc ses photos dans le dossier, dans le venté, dans la médiatèque de la classe.
|
||||
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
|
||||
Les élèves par la suite utiliseront le blog.
|
||||
À partir de leur note, il pourront se loi de par poste rédigène article dans le reinté.
|
||||
Ils illustront ses articles à l'aide des photos de commun numérique mise à n'accélier dans la même thé.
|
||||
Pour se faire, il pourront utiliser les dites ravences qui les renvèrent directement dans la médiatèque de la classe,
|
||||
où ils pourront retrouver le dossier créé par leur enseignon.
|
||||
Une fois leur article terminée, les élèves soumétront se lui-ci au professeur,
|
||||
qui pourra soit la noter pour correction ou le public.
|
||||
Ensuite, il pourront lire et commenter ce de leur camarade, ou répondre au commentaire de la veille.
|
||||
`
|
||||
)
|
||||
})
|
||||
|
||||
it('Should produce the same transcript text as openai-whisper given the same parameters', async function () {
|
||||
const transcribeParameters: Parameters<typeof transcriber.transcribe> = [
|
||||
shortVideoPath,
|
||||
{ name: 'tiny' },
|
||||
'en',
|
||||
'txt'
|
||||
]
|
||||
|
||||
const transcript = await transcriber.transcribe(...transcribeParameters)
|
||||
|
||||
const openaiTranscriber = new OpenaiTranscriber(
|
||||
{
|
||||
name: 'openai-whisper',
|
||||
requirements: [],
|
||||
type: 'binary',
|
||||
binary: 'whisper',
|
||||
supportedModelFormats: [ 'PyTorch' ]
|
||||
},
|
||||
createLogger(),
|
||||
join(transcriptDirectory, 'openai-whisper')
|
||||
)
|
||||
const openaiTranscript = await openaiTranscriber.transcribe(...transcribeParameters)
|
||||
|
||||
expect(await transcript.read()).to.equals(await openaiTranscript.read())
|
||||
})
|
||||
|
||||
after(async function () {
|
||||
await rm(transcriptDirectory, { recursive: true, force: true })
|
||||
})
|
||||
})
|
|
@ -0,0 +1,137 @@
|
|||
/* eslint-disable @typescript-eslint/no-unused-expressions, max-len */
|
||||
import { expect, config } from 'chai'
|
||||
import { createLogger } from 'winston'
|
||||
import { join } from 'path'
|
||||
import { mkdir, readFile, rm } from 'node:fs/promises'
|
||||
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
|
||||
import { Ctranslate2Transcriber, OpenaiTranscriber, TranscriptFile } from '@peertube/peertube-transcription'
|
||||
|
||||
config.truncateThreshold = 0
|
||||
|
||||
describe('Whisper CTranslate2 transcriber', function () {
|
||||
const transcriptDirectory = join(root(), 'test-transcript')
|
||||
const shortVideoPath = buildAbsoluteFixturePath('video_short.mp4')
|
||||
const frVideoPath = buildAbsoluteFixturePath('transcription/videos/communiquer-lors-dune-classe-transplantee.mp4')
|
||||
const transcriber = new Ctranslate2Transcriber(
|
||||
{
|
||||
name: 'anyNameShouldBeFineReally',
|
||||
requirements: [],
|
||||
type: 'binary',
|
||||
binary: 'whisper-ctranslate2',
|
||||
supportedModelFormats: []
|
||||
},
|
||||
createLogger(),
|
||||
transcriptDirectory
|
||||
)
|
||||
|
||||
before(async function () {
|
||||
await mkdir(transcriptDirectory, { recursive: true })
|
||||
})
|
||||
|
||||
it('Should transcribe a media file and provide a valid path to a transcript file in `vtt` format by default', async function () {
|
||||
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' })
|
||||
expect(await transcript.equals(new TranscriptFile({ path: join(transcriptDirectory, 'video_short.vtt') }))).to.be.true
|
||||
expect(await readFile(transcript.path, 'utf8')).to.equal(
|
||||
`WEBVTT
|
||||
|
||||
00:00.000 --> 00:02.000
|
||||
You
|
||||
|
||||
`
|
||||
)
|
||||
})
|
||||
|
||||
it('May produce a transcript file in the `srt` format', async function () {
|
||||
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'srt')
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'video_short.srt'),
|
||||
format: 'srt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await readFile(transcript.path, 'utf8')).to.equal(
|
||||
`1
|
||||
00:00:00,000 --> 00:00:02,000
|
||||
You
|
||||
|
||||
`
|
||||
)
|
||||
})
|
||||
|
||||
it('May produce a transcript file in the `txt` format', async function () {
|
||||
const transcript = await transcriber.transcribe(shortVideoPath, { name: 'tiny' }, 'en', 'txt')
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'video_short.txt'),
|
||||
format: 'txt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equal(`You
|
||||
`)
|
||||
})
|
||||
|
||||
it('May transcribe a media file using a local CTranslate2 model', async function () {
|
||||
const transcript = await transcriber.transcribe(
|
||||
shortVideoPath,
|
||||
{ name: 'myLocalModel', path: buildAbsoluteFixturePath('transcription/models/faster-whisper-tiny') },
|
||||
'en',
|
||||
'txt'
|
||||
)
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'video_short.txt'),
|
||||
format: 'txt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equal(`You
|
||||
`)
|
||||
})
|
||||
|
||||
it('May transcribe a media file in french', async function () {
|
||||
this.timeout(45000)
|
||||
const transcript = await transcriber.transcribe(frVideoPath, { name: 'tiny' }, 'fr', 'txt')
|
||||
expect(await transcript.equals(new TranscriptFile({
|
||||
path: join(transcriptDirectory, 'communiquer-lors-dune-classe-transplantee.txt'),
|
||||
language: 'fr',
|
||||
format: 'txt'
|
||||
}))).to.be.true
|
||||
|
||||
expect(await transcript.read()).to.equal(
|
||||
`Communiquez lors d'une classe transplante. Utilisez les photos prises lors de cette classe pour raconter quotidiennement le séjour vécu.
|
||||
C'est le scénario P.Dagujic présenté par Monsieur Navoli, professeur ainsi que le 3 sur une école alimentaire de Montpellier.
|
||||
La première application utilisée sera la médiatique. L'enseignant va alors transférer les différentes photos réalisés lors de la classe transplante.
|
||||
Dans un dossier, spécifique pour que les élèves puissent le retrouver plus facilement. Il téléverse donc ses photos dans le dossier, dans le venté, dans la médiatique de la classe.
|
||||
Pour terminer, il s'assure que le dossier soit bien ouvert aux utilisateurs afin que tout le monde puisse l'utiliser.
|
||||
Les élèves par la suite utiliseront le blog, à partir de leur nante, il pourront se loi de parposte rédigeant un article d'un orienté.
|
||||
Ils illustront ces articles à l'aide des photos de commun numériques mises un accès libre dans leaineté. Pour se faire, il pourront utiliser les détecteurs avancés qui des renvers un directement dans la médiatique de la classe, où il pourront retrouver le dossier créé par leur enseignant.
|
||||
Une fois leur article terminée, les élèves soumettront celui-ci au professeur qui pourra soit la noté pour correction ou le public.
|
||||
Ensuite, il pourront lire et commenter ce de leur camarade, on répondra au commentaire de la veille.
|
||||
`
|
||||
)
|
||||
})
|
||||
|
||||
it('Should produce the same transcript text as openai-whisper given the same parameters', async function () {
|
||||
const transcribeArguments: Parameters<typeof transcriber.transcribe> = [
|
||||
shortVideoPath,
|
||||
{ name: 'tiny' },
|
||||
'en',
|
||||
'txt'
|
||||
]
|
||||
const transcript = await transcriber.transcribe(...transcribeArguments)
|
||||
const openaiTranscriber = new OpenaiTranscriber(
|
||||
{
|
||||
name: 'openai-whisper',
|
||||
requirements: [],
|
||||
type: 'binary',
|
||||
binary: 'whisper',
|
||||
supportedModelFormats: [ 'PyTorch' ]
|
||||
},
|
||||
createLogger(),
|
||||
join(transcriptDirectory, 'openai-whisper')
|
||||
)
|
||||
const openaiTranscript = await openaiTranscriber.transcribe(...transcribeArguments)
|
||||
|
||||
expect(await transcript.equals(openaiTranscript))
|
||||
})
|
||||
|
||||
after(async function () {
|
||||
await rm(transcriptDirectory, { recursive: true, force: true })
|
||||
})
|
||||
})
|
|
@ -6,7 +6,8 @@
|
|||
"tsBuildInfoFile": "./dist/.tsbuildinfo",
|
||||
"paths": {
|
||||
"@tests/*": [ "./src/*" ],
|
||||
"@server/*": [ "../../server/core/*" ]
|
||||
"@server/*": [ "../../server/core/*" ],
|
||||
"@peertube/peertube-transcription": [ "../transcription" ]
|
||||
}
|
||||
},
|
||||
"references": [
|
||||
|
@ -16,6 +17,7 @@
|
|||
{ "path": "../node-utils" },
|
||||
{ "path": "../typescript-utils" },
|
||||
{ "path": "../server-commands" },
|
||||
{ "path": "../transcription" },
|
||||
{ "path": "../../server/tsconfig.lib.json" }
|
||||
],
|
||||
"include": [
|
||||
|
|
|
@ -0,0 +1,92 @@
|
|||
|
||||
DeepLearningFramework vs training libraries
|
||||
|
||||
https://github.com/openai/whisper/blob/main/whisper/__init__.py#L144
|
||||
|
||||
|
||||
```typescript
|
||||
interface DeepLearningFramework {
|
||||
name: string
|
||||
}
|
||||
const deepLearningFrameworks: DeepLearningFramework[] = [
|
||||
{
|
||||
name: 'PyTorch',
|
||||
distributed: true,
|
||||
gpu: true
|
||||
},
|
||||
{
|
||||
name: 'TensorFlow'
|
||||
}
|
||||
]
|
||||
```
|
||||
|
||||
|
||||
What about the lifecycle of each transcriber ?
|
||||
- install => installer
|
||||
- update => updater
|
||||
|
||||
For the **Python** packages :
|
||||
1. Install
|
||||
```sh
|
||||
pip install <package-name>
|
||||
```
|
||||
The package version should be constrained to a version compatible with our wrapper.
|
||||
We could also attempt to run our tests against different versions of the lib to be future ready.
|
||||
|
||||
2. Update
|
||||
```sh
|
||||
pip install -U <package-name>
|
||||
```
|
||||
|
||||
> Need the package name somewhere in the model
|
||||
>
|
||||
>
|
||||
### Whisper timestamped discrepancies
|
||||
- Lower case instead of upper case
|
||||
- missing .json file
|
||||
- binary name is awkward: the package is named whisper-timestamped but the binary is named whisper_timestamped
|
||||
> https://github.com/linto-ai/whisper-timestamped/issues?q=is:issue+author:lutangar
|
||||
|
||||
|
||||
## About models
|
||||
Convert Whisper transformer model from PyTorch to ggml format
|
||||
: convert the original Whisper PyTorch models provided by OpenAI to ggml format in order to be able to load them in C/C++
|
||||
|
||||
In supervised machine learning, the artefact created after training that is used to make predictions on new data is called a model.
|
||||
models can be saved in a file that can potentially be compressed, so typically model files have a binary file format
|
||||
TensorFlow saves models as protocol buffer files, with a .pb file extension.
|
||||
Keras saves models natively as .h5 file.
|
||||
Scikit-Learn saves models as pickled python objects, with a .pkl file extension.
|
||||
An older format for model serving based on XML, predictive model markup language (.pmml), is still usable on some frameworks, such as Scikit-Learn.
|
||||
|
||||
Training File Formats :
|
||||
- petastorm
|
||||
- npy
|
||||
- tfrecords
|
||||
|
||||
Model Serving Serialization Formats
|
||||
- pb
|
||||
- mlmodel
|
||||
onnx
|
||||
pkl
|
||||
older : h5 pmml
|
||||
|
||||
Hugging Face fine-tuned models to ggml format
|
||||
or Whisper transformer model ?
|
||||
|
||||
ML models vs Transformer Model
|
||||
Transcription Model
|
||||
|
||||
Other model file formats that are used include SparkML models that can be saved in MLeap file format and served in real-time using a MLleap model server (files are packaged in .zip format). Apple developed the .mlmodel file format to store models embedded in iOS applications as part of its Core ML framework (which has superior support for ObjectiveC and Swift languages). Applications trained in TensorFlow, Scikit-Learn, and other frameworks need to convert their model files to the .mlmodel file format for use on iOS, with tools like, coremltools and Tensorflow converter being available to help file format conversion. ONNX is a ML framework independent file format, supported by Microsoft, Facebook, and Amazon. In theory, any ML framework should be able to export its models in .onnx file format, so it offers great promise in unifying model serving across the different frameworks. However, as of late 2019, ONNX does not support all operations for the most popular ML frameworks (TensorFlow, PyTorch, Scikit-Learn), so ONNX is not yet practical for those frameworks. In PyTorch, the recommended way to serve models is to use Torch Script to trace and save a model as a .pt file and serve it from a C++ application.
|
||||
|
||||
One final file format to mention here is YAML that is used to package models as part of the MLFlow framework for ML pipelines on Spark. MLFlow stores a YAML file that describes the files it packages for model serving, so that deployment tools can understand the model file format and know what files to deploy.
|
||||
// ModelServingFileSerializationFormats
|
||||
File formats: .pb, .onnx, .pkl, .mlmodel, .zip, .pmml, .pt
|
||||
Inference: .pb files are served by TensorFlowServing Server;
|
||||
.onnx files are served by Microsoft’s commercial model serving platform;
|
||||
.pkl files are served for Scikit-Learn models, often on Flask servers;
|
||||
.mlmodel files are served by iOS platforms;
|
||||
.zip files are used to package up MLeap files that are served on the MLeap runtime;
|
||||
.pt files are use to package PyTorch models that can be served inside C++ applications.
|
||||
.'PyTorch' | 'GGML' | 'ONNX' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark
|
||||
https://towardsdatascience.com/guide-to-file-formats-for-machine-learning-columnar-training-inferencing-and-the-feature-store-2e0c3d18d4f9
|
|
@ -0,0 +1,19 @@
|
|||
{
|
||||
"name": "@peertube/peertube-transcription",
|
||||
"private": true,
|
||||
"version": "0.0.0",
|
||||
"main": "dist/index.js",
|
||||
"files": [ "dist" ],
|
||||
"exports": {
|
||||
"types": "./dist/index.d.ts",
|
||||
"peertube:tsx": "./src/index.ts",
|
||||
"default": "./dist/index.js"
|
||||
},
|
||||
"type": "module",
|
||||
"devDependencies": {},
|
||||
"scripts": {
|
||||
"build": "tsc",
|
||||
"watch": "tsc -w"
|
||||
},
|
||||
"dependencies": {}
|
||||
}
|
|
@ -0,0 +1,81 @@
|
|||
import { join } from 'node:path'
|
||||
import { existsSync } from 'node:fs'
|
||||
import { PerformanceObserver } from 'node:perf_hooks'
|
||||
import assert from 'node:assert'
|
||||
import { createLogger, Logger } from 'winston'
|
||||
import short from 'short-uuid'
|
||||
import { root } from '@peertube/peertube-node-utils'
|
||||
import { TranscriptionEngine } from './transcription-engine.js'
|
||||
import { TranscriptionModel } from './transcription-model.js'
|
||||
import { TranscriptFile, TranscriptFormat } from './transcript/index.js'
|
||||
|
||||
/**
 * Base class for all transcriber implementations (openai-whisper, whisper-ctranslate2, …).
 * Holds the engine description, the output directory and a small performance-measurement
 * protocol based on `performance.mark` / `performance.measure`.
 */
export abstract class AbstractTranscriber {
  // Default output directory for transcript files, relative to the project root.
  public static DEFAULT_TRANSCRIPT_DIRECTORY = join(root(), 'dist', 'transcripts')

  engine: TranscriptionEngine
  logger: Logger
  // Directory where transcript files produced by `transcribe()` are written.
  transcriptDirectory: string
  performanceObserver?: PerformanceObserver
  // Identifier of the current transcription run; set by `createPerformanceMark()`.
  runId?: string

  constructor (
    engine: TranscriptionEngine,
    logger: Logger = createLogger(),
    transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY,
    performanceObserver?: PerformanceObserver
  ) {
    this.engine = engine
    this.logger = logger
    this.transcriptDirectory = transcriptDirectory
    this.performanceObserver = performanceObserver
  }

  // NOTE(review): stub — always resolves to an empty string; language detection is not implemented yet.
  detectLanguage () {
    return Promise.resolve('')
  }

  // NOTE(review): stub — the existence check result is discarded and nothing is actually loaded yet.
  loadModel (model: TranscriptionModel) {
    if (existsSync(model.path)) { /* empty */ }
  }

  /**
   * Whether this transcriber can use the given model. Subclasses override when they
   * support other formats than PyTorch.
   */
  supports (model: TranscriptionModel) {
    return model.format === 'PyTorch'
  }

  /**
   * Start a new measured run: generates a unique run id and records the start mark.
   * Must be called before `measurePerformanceMark()`.
   */
  createPerformanceMark () {
    this.runId = `${short.uuid()}-${this.engine.name}`
    performance.mark(this.getStartPerformanceMarkName())
  }

  /**
   * Record the end mark and measure the elapsed time between the start and end marks
   * of the current run. Measurement errors are logged, not rethrown, so a failed
   * measurement never breaks a transcription.
   */
  measurePerformanceMark () {
    try {
      performance.mark(this.getEndPerformanceMarkName())
      performance.measure(
        this.runId,
        this.getStartPerformanceMarkName(),
        this.getEndPerformanceMarkName()
      )
    } catch (e) {
      this.logger.log({ level: 'error', message: e })
    }
  }

  // Name of the start performance mark for the current run.
  getStartPerformanceMarkName () {
    assert(!!this.runId, 'Each transcription run should have an id.')

    return `${this.runId}-started`
  }

  // Name of the end performance mark for the current run.
  getEndPerformanceMarkName () {
    assert(!!this.runId, 'Each transcription run should have an id.')

    return `${this.runId}-ended`
  }

  /**
   * Transcribe a media file with the given model, returning the produced transcript file.
   * Implemented by each engine-specific subclass.
   */
  abstract transcribe (
    mediaFilePath: string,
    model: TranscriptionModel,
    language: string,
    format: TranscriptFormat
  ): Promise<TranscriptFile>
}
|
|
@ -0,0 +1,23 @@
|
|||
import { expect } from 'chai'
|
||||
import { toHumanReadable, toTimecode } from './duration.js'
|
||||
|
||||
describe('duration conversion functions', () => {
|
||||
it('toHumanReadable', () => {
|
||||
const ONE_MINUTE = 60000
|
||||
let humanDuration = toHumanReadable(ONE_MINUTE)
|
||||
expect(humanDuration).to.equal('1m')
|
||||
|
||||
humanDuration = toHumanReadable(ONE_MINUTE * 60 + ONE_MINUTE)
|
||||
expect(humanDuration).to.equal('1h 1m')
|
||||
})
|
||||
|
||||
it('toTimecode', () => {
|
||||
const MORE_OR_LESS_ONE_MINUTE = '60.41545'
|
||||
let timecode = toTimecode(MORE_OR_LESS_ONE_MINUTE)
|
||||
expect(timecode).to.equal('00:01:00')
|
||||
|
||||
const ONE_HOUR = '3600'
|
||||
timecode = toTimecode(ONE_HOUR)
|
||||
expect(timecode).to.equal('01:00:00')
|
||||
})
|
||||
})
|
|
@ -0,0 +1,35 @@
|
|||
export interface DurationDescriptor {
|
||||
duration: number
|
||||
unit: string
|
||||
}
|
||||
|
||||
export function toHumanReadable (ms: number) {
|
||||
const date = new Date(ms)
|
||||
|
||||
const durationDescriptors: DurationDescriptor[] = [
|
||||
{ duration: date.getUTCHours(), unit: 'h' },
|
||||
{ duration: date.getUTCMinutes(), unit: 'm' },
|
||||
{ duration: date.getUTCSeconds(), unit: 's' }
|
||||
]
|
||||
|
||||
return durationDescriptors
|
||||
.map(toWords)
|
||||
.filter((words) => words)
|
||||
.join(' ')
|
||||
}
|
||||
|
||||
export function toWords ({ duration, unit }: DurationDescriptor) {
|
||||
return duration > 0 ? `${duration}${unit}` : ''
|
||||
}
|
||||
|
||||
export function toTimecode (s: number | string) {
|
||||
const date = new Date(0, 0, 0, 0, 0, parseFloat(s.toString()), 0)
|
||||
const hours = date.getHours()
|
||||
const minutes = date.getMinutes()
|
||||
const seconds = date.getSeconds()
|
||||
return `${padLeft(hours)}:${padLeft(minutes)}:${padLeft(seconds)}`
|
||||
}
|
||||
|
||||
function padLeft (value: number, length = 2): string {
|
||||
return value.toString().padStart(length, '0')
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
import { basename, extname } from 'path'
|
||||
|
||||
export const getFileInfo = (path: string) => {
|
||||
const extension = extname(path)
|
||||
const baseName = basename(path, extension)
|
||||
const name = `${baseName}${extension}`
|
||||
|
||||
return ({
|
||||
extension,
|
||||
baseName,
|
||||
name
|
||||
})
|
||||
}
|
|
@ -0,0 +1,11 @@
|
|||
import { TranscriberFactory } from './transcriber-factory.js'
|
||||
import { engines } from './whisper/index.js'
|
||||
|
||||
export * from './duration.js'
|
||||
|
||||
export * from './transcript/index.js'
|
||||
export * from './transcription-engine.js'
|
||||
export * from './transcription-model.js'
|
||||
export * from './whisper/index.js'
|
||||
|
||||
export const transcriberFactory = new TranscriberFactory(engines)
|
|
@ -0,0 +1,49 @@
|
|||
import { Logger, createLogger } from 'winston'
|
||||
import { TranscriptionEngine } from './transcription-engine.js'
|
||||
import {
|
||||
Ctranslate2Transcriber,
|
||||
OpenaiTranscriber, WhisperTimestampedTranscriber
|
||||
} from './whisper/index.js'
|
||||
import { AbstractTranscriber } from './abstract-transcriber.js'
|
||||
|
||||
export class TranscriberFactory {
|
||||
engines: TranscriptionEngine[]
|
||||
|
||||
constructor (engines: TranscriptionEngine[]) {
|
||||
this.engines = engines
|
||||
}
|
||||
|
||||
createFromEngineName (
|
||||
engineName: string,
|
||||
logger: Logger = createLogger(),
|
||||
transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY
|
||||
) {
|
||||
const engine = this.getEngineByName(engineName)
|
||||
|
||||
const transcriberArgs: ConstructorParameters<typeof AbstractTranscriber> = [
|
||||
engine,
|
||||
logger,
|
||||
transcriptDirectory
|
||||
]
|
||||
|
||||
switch (engineName) {
|
||||
case 'openai-whisper':
|
||||
return new OpenaiTranscriber(...transcriberArgs)
|
||||
case 'whisper-ctranslate2':
|
||||
return new Ctranslate2Transcriber(...transcriberArgs)
|
||||
case 'whisper-timestamped':
|
||||
return new WhisperTimestampedTranscriber(...transcriberArgs)
|
||||
default:
|
||||
throw new Error(`Unimplemented engine ${engineName}`)
|
||||
}
|
||||
}
|
||||
|
||||
getEngineByName (engineName: string) {
|
||||
const engine = this.engines.find(({ name }) => name === engineName)
|
||||
if (!engine) {
|
||||
throw new Error(`Unknow engine ${engineName}`)
|
||||
}
|
||||
|
||||
return engine
|
||||
}
|
||||
}
|
|
@ -0,0 +1,3 @@
|
|||
export * from './transcript-file.js'
|
||||
export * from './transcript-file-evaluator.js'
|
||||
export * from './transcript-file-interface.js'
|
|
@ -0,0 +1,75 @@
|
|||
import { $ } from 'execa'
|
||||
import assert from 'node:assert'
|
||||
import { TranscriptFile } from './index.js'
|
||||
|
||||
/**
 * Compares a reference transcript against a hypothesis transcript, producing
 * WER / CER / alignment metrics.
 *
 * This transcript evaluator is based on the Jiwer CLI, a Python implementation:
 * https://jitsi.github.io/jiwer/cli/
 *
 * There are plenty of implementations of WER (Word Error Rate) and CER (Character Error Rate)
 * calculation in Python but not that many in NodeJs.
 *
 * NOTE(review): requires the `jiwer` binary to be available on the PATH — confirm how it is installed.
 */
export class TranscriptFileEvaluator {
  // Ground-truth transcript; must be in `txt` format.
  referenceTranscriptFile: TranscriptFile
  // Machine-produced transcript to evaluate; must be in `txt` format.
  hypothesisTranscriptFile: TranscriptFile

  /**
   * @throws AssertionError when either transcript file is not in `txt` format
   */
  constructor (referenceTranscriptFile: TranscriptFile, hypothesisTranscriptFile: TranscriptFile) {
    assert(referenceTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')
    assert(hypothesisTranscriptFile.format === 'txt', 'Can only evaluate txt transcript file')

    this.referenceTranscriptFile = referenceTranscriptFile
    this.hypothesisTranscriptFile = hypothesisTranscriptFile
  }

  /**
   * Build the jiwer CLI argument list for the given transcript file paths,
   * followed by any extra flags.
   */
  static buildArgs (referenceTranscriptFilepath: string, hypothesisTranscriptFilepath: string, ...args: string[]) {
    return [
      '--reference',
      referenceTranscriptFilepath,
      '--hypothesis',
      hypothesisTranscriptFilepath,
      ...args
    ]
  }

  // Same as the static variant, bound to this evaluator's transcript files.
  buildArgs (...args: string[]) {
    return TranscriptFileEvaluator.buildArgs(this.referenceTranscriptFile.path, this.hypothesisTranscriptFile.path, ...args)
  }

  /**
   * WER: Word Error Rate
   */
  async wer () {
    const { stdout: wer } = await $`jiwer ${this.buildArgs('-g')}`

    return Number(wer)
  }

  /**
   * CER: Character Error Rate
   *
   * Returns `undefined` when the jiwer CLI fails — the failure is deliberately
   * swallowed as a best-effort workaround for the upstream issue below.
   */
  async cer () {
    // @see https://github.com/jitsi/jiwer/issues/87
    let result = {
      stdout: undefined
    }
    try {
      result = await $`jiwer ${this.buildArgs('--cer')}`
    } catch {}

    return result.stdout ? Number(result.stdout) : undefined
  }

  /**
   * Raw alignment output from jiwer.
   * NOTE(review): "alignement" is the French spelling — renaming to "alignment"
   * would need a coordinated change since this is public API.
   */
  async alignement () {
    const { stdout: alignement } = await $`jiwer ${this.buildArgs('--align')}`

    return alignement
  }

  // Run all three metrics sequentially and aggregate them into a single report.
  async evaluate () {
    return {
      wer: await this.wer(),
      cer: await this.cer(),
      alignement: await this.alignement()
    }
  }
}
|
|
@ -0,0 +1,3 @@
|
|||
export type TranscriptFormat = 'txt' | 'vtt' | 'srt'
|
||||
|
||||
export type TranscriptFileInterface = { path: string, language?: string, format: TranscriptFormat }
|
|
@ -0,0 +1,50 @@
|
|||
import { statSync } from 'node:fs'
|
||||
import { readFile, writeFile } from 'node:fs/promises'
|
||||
import { TranscriptFileInterface, TranscriptFormat } from './transcript-file-interface.js'
|
||||
import { TranscriptFileEvaluator } from './transcript-file-evaluator.js'
|
||||
|
||||
/**
 * A transcript file on disk, with its language and format (`vtt` by default).
 */
export class TranscriptFile implements TranscriptFileInterface {
  path: string
  language: string = 'en'
  format: TranscriptFormat = 'vtt'

  /**
   * @throws Error (ENOENT) when the file does not exist — `statSync` is called
   * purely for its side effect of validating the path.
   */
  constructor ({ path, language = 'en', format = 'vtt' }: { path: string, language?: string, format?: TranscriptFormat }) {
    statSync(path)

    this.path = path
    this.language = language
    this.format = format
  }

  /**
   * Asynchronously reads the entire contents of a transcript file.
   * @see https://nodejs.org/docs/latest-v18.x/api/fs.html#filehandlereadfileoptions for options
   */
  async read (options: Parameters<typeof readFile>[1] = 'utf8') {
    return await readFile(this.path, options)
  }

  /**
   * Write a transcript file to disk and return a `TranscriptFile` describing it.
   */
  static async write ({
    path,
    content,
    language = 'en',
    format = 'vtt'
  }: { path: string, content: string, language?: string, format?: TranscriptFormat }): Promise<TranscriptFile> {
    await writeFile(path, content)

    return new TranscriptFile({ path, language, format })
  }

  /**
   * Two transcripts are considered equal when their file contents match exactly;
   * `language` and `format` are intentionally not compared.
   */
  async equals (transcript: TranscriptFile) {
    return await transcript.read() === await this.read()
  }

  /**
   * Evaluate the given transcript against this one (used as the reference),
   * returning WER / CER / alignment metrics. Requires both files to be `txt`.
   */
  async evaluate (transcript: TranscriptFile) {
    const evaluator = new TranscriptFileEvaluator(this, transcript)

    return evaluator.evaluate()
  }
}
|
|
@ -0,0 +1,22 @@
|
|||
import { ModelFormat } from './transcription-model.js'
|
||||
|
||||
/**
 * The transcription engine, or framework, wrapped by a transcriber implementation
 * (e.g. openai-whisper, whisper-ctranslate2, whisper.cpp).
 */
export class TranscriptionEngine {
  name: string
  description?: string
  // Implementation language of the engine, e.g. 'python' or 'cpp'.
  language?: string
  // Runtime requirements, e.g. 'python', 'ffmpeg'.
  requirements: string[]
  type: 'binary' | 'bindings' | 'ws'
  // Name of the executable invoked when `type` is 'binary'.
  binary: string
  license?: string
  // URL of the engine's source repository.
  forgeURL?: string
  supportedModelFormats: ModelFormat[]
  // NOTE: there could be a default model, or a list of default models.

  // All fields are copied verbatim from the provided parameters object.
  constructor (parameters: TranscriptionEngine) {
    Object.assign(this, parameters)
  }
}
|
|
@ -0,0 +1,19 @@
|
|||
// Serialization formats a speech-to-text model may be distributed in.
export type ModelFormat = 'PyTorch' | 'GGML' | 'ONNX' | 'CTranslate2' // CoreML, OpenVino, Scikit-Learn, TensorFlow/Keras, PySpark

/**
 * Description of a speech-to-text model usable by a transcriber,
 * either built-in (by `name`) or custom (by local `path` / remote `url`).
 */
export abstract class TranscriptionModel {
  name: string
  format?: ModelFormat
  // Local filesystem location of the model, when already available on disk.
  path?: string
  // NOTE(review): presumably a remote location to download the model from — not used by the visible transcribers yet.
  url?: string

  // # - hparams
  // # - Number of dimensions (int)
  // # - Name length (int)
  // # - Dimensions (int[n_dims])
  // # - Name (char[name_length])
  // # - Data (float[n_dims])

  // # - mel filters
  // # - tokenizer vocab
  // # - model variables
}
|
|
@ -0,0 +1,81 @@
|
|||
- cpp
|
||||
- ctranslate2
|
||||
- faster-whisper
|
||||
- insanely-fast-whisper
|
||||
- whisper
|
||||
- transformers.js
|
||||
- whisperX
|
||||
|
||||
Transformers* could be defined as an all-purpose inference engine instead of a whisper-only engine :
|
||||
- to create a video summary
|
||||
-
|
||||
|
||||
|
||||
|
||||
// mixed precision training
|
||||
// env.cacheDir = './.cache';
|
||||
// env.localModelPath = '/path/to/models/';
|
||||
// env.allowRemoteModels = false;
|
||||
// To optimize the data pipeline, you should use techniques such as
|
||||
// caching,
|
||||
// prefetching,
|
||||
// batching,
|
||||
// sharding, and
|
||||
// compression, depending on the characteristics and size of your data.
|
||||
// You should also monitor the data throughput and utilization of the GPU and CPU devices, and adjust the data pipeline accordingly.
|
||||
// 1) Prefetching: To load data asynchronously while the model is training on the current batch. This minimizes data loading bottlenecks.
|
||||
// 2) Data Sampling for initial models: For initial model development or debugging, working with a smaller subset of your data to can help speedy setup and output.
|
||||
// 3) Parallel Processing: This is the most obvious point and important point. Utilize multi-threading or multiprocessing libraries like concurrent.futures in Python to preprocess data in parallel. This is particularly effective when dealing with large datasets.
|
||||
// https://www.linkedin.com/advice/3/how-can-you-optimize-machine-learning-models
|
||||
// Use mixed precision training
|
||||
// Apply model pruning and quantization
|
||||
// Sizing the model will almost always help with performance,
|
||||
// On GPUs,
|
||||
// - leverage batch processing
|
||||
// - and mixed-precision training,
|
||||
// - manage GPU memory,
|
||||
// - and consider model pruning.
|
||||
// For CPUs,
|
||||
// - utilize multi-threading,
|
||||
// - efficient libraries,
|
||||
// - batch inference, quantization,
|
||||
// - and model optimization.
|
||||
// - Employ
|
||||
// - compiler flags,
|
||||
// - caching,
|
||||
// - and distributed computing for CPU performance.
|
||||
// Profiling tools help identify bottlenecks on both hardware types, ensuring efficient model deployment in diverse environments.
|
||||
// The choice between GPU and CPU optimization depends on the specific task and hardware resources available.
|
||||
// Cela pourrait être chouette de pouvoir run des tests sur des runners gpu depuis Github Actions :
|
||||
// https://resources.github.com/devops/accelerate-your-cicd-with-arm-and-gpu-runners-in-github-actions/
|
||||
|
||||
// Techniques such as
|
||||
// model quantization, pruning,
|
||||
// and other optimizations can further enhance the efficiency of running these models on CPU hardware.
|
||||
// If you're looking to deploy Whisper models on CPU-based systems, you can use popular deep learning frameworks like TensorFlow or PyTorch, which provide support for deploying models on CPU and offer optimizations for inference performance. Additionally, platforms like ONNX Runtime or TensorFlow Lite offer optimizations for inference on CPU, including support for quantized models and hardware acceleration where available.
|
||||
|
||||
// https://eval.ai/web/challenges/challenge-page/1637/overview
|
||||
// https://github.com/fquirin/speech-recognition-experiments
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
// => are producing models
|
||||
|
||||
|
||||
// PyTorch and TensorFlow
|
||||
// deepLearningFramework
|
||||
// cpp.ts
|
||||
// ctranslate2.ts
|
||||
// faster.ts
|
||||
// insanely-fast.ts
|
||||
// python.ts
|
||||
// transformer.ts
|
||||
// X .ts
|
||||
|
||||
// whisper.cpp
|
||||
// ggml
|
|
@ -0,0 +1,58 @@
|
|||
import { TranscriptionEngine } from '../transcription-engine.js'
|
||||
|
||||
// Registry of known transcription engines, consumed by `TranscriberFactory`.
export const engines: TranscriptionEngine[] = [
  {
    name : 'whisper-cpp',
    description : 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
    type: 'binary',
    binary: 'main',
    language : 'cpp',
    requirements : [],
    forgeURL : 'https://github.com/ggerganov/whisper.cpp',
    license : 'MIT',
    // NOTE(review): whisper.cpp loads ggml models — 'ONNX' here looks suspect; confirm.
    supportedModelFormats: [ 'ONNX' ]
  },
  // {
  //   name : 'transformers',
  //   description : 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
  //   type: 'binary',
  //   language : 'python',
  //   requirements : [],
  //   forgeURL : '',
  //   license : '',
  //   supportedModelFormats: [ 'ONNX' ]
  // },
  {
    name: 'openai-whisper',
    description: 'High-performance inference of OpenAI\'s Whisper automatic speech recognition model',
    requirements: [ 'python', 'pyTorch', 'ffmpeg' ],
    language: 'python',
    type: 'binary',
    binary: 'whisper',
    forgeURL: 'https://github.com/openai/whisper',
    license: 'MIT',
    supportedModelFormats: [ 'PyTorch' ]
  },
  {
    name: 'whisper-ctranslate2',
    description: '',
    requirements: [ 'python' ],
    language: 'python',
    type: 'binary',
    binary: 'whisper-ctranslate2',
    // NOTE(review): points to openai/whisper — presumably should be the whisper-ctranslate2 repository; verify.
    forgeURL: 'https://github.com/openai/whisper',
    license: 'MIT',
    supportedModelFormats: [ 'CTranslate2' ]
  },
  {
    name: 'whisper-timestamped',
    description: '',
    requirements: [ 'python' ],
    language: 'python',
    type: 'binary',
    binary: 'whisper_timestamped',
    // NOTE(review): points to openai/whisper — presumably should be linto-ai/whisper-timestamped; verify.
    forgeURL: 'https://github.com/openai/whisper',
    license: 'MIT',
    // NOTE(review): 'CTranslate2' looks suspect for whisper-timestamped (it wraps openai-whisper); confirm.
    supportedModelFormats: [ 'CTranslate2' ]
  }
]
|
|
@ -0,0 +1,2 @@
|
|||
export * from './transcriber/index.js'
|
||||
export * from './engines.js'
|
|
@ -0,0 +1,48 @@
|
|||
import { $ } from 'execa'
|
||||
import { join } from 'path'
|
||||
import { lstat } from 'node:fs/promises'
|
||||
import { OpenaiTranscriber } from './openai-transcriber.js'
|
||||
import { TranscriptionModel } from '../../transcription-model.js'
|
||||
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
|
||||
import { getFileInfo } from '../../file-utils.js'
|
||||
|
||||
export class Ctranslate2Transcriber extends OpenaiTranscriber {
|
||||
public static readonly MODEL_FILENAME = 'model.bin'
|
||||
|
||||
async transcribe (
|
||||
mediaFilePath: string,
|
||||
model: TranscriptionModel = { name: 'tiny' },
|
||||
language: string = 'en',
|
||||
format: TranscriptFormat = 'vtt'
|
||||
): Promise<TranscriptFile> {
|
||||
this.createPerformanceMark()
|
||||
// Shall we run the command with `{ shell: true }` to get the same error as in sh ?
|
||||
// ex: ENOENT => Command not found
|
||||
const $$ = $({ verbose: true })
|
||||
const { baseName } = getFileInfo(mediaFilePath)
|
||||
|
||||
if (model.path) {
|
||||
await lstat(model.path).then(stats => stats.isDirectory())
|
||||
}
|
||||
const modelArgs = model.path ? [ '--model_directory', model.path ] : [ '--model', model.name ]
|
||||
|
||||
await $$`${this.engine.binary} ${[
|
||||
mediaFilePath,
|
||||
...modelArgs,
|
||||
'--output_format',
|
||||
format,
|
||||
'--output_dir',
|
||||
this.transcriptDirectory,
|
||||
'--language',
|
||||
language
|
||||
]}`
|
||||
|
||||
this.measurePerformanceMark()
|
||||
|
||||
return new TranscriptFile({
|
||||
language,
|
||||
path: join(this.transcriptDirectory, `${baseName}.${format}`),
|
||||
format
|
||||
})
|
||||
}
|
||||
}
|
|
@ -0,0 +1,5 @@
|
|||
export * from './ctranslate2-transcriber.js'
|
||||
export * from './transformers-js-transcriber.js'
|
||||
export * from './transformers-transcriber.js'
|
||||
export * from './openai-transcriber.js'
|
||||
export * from './timestamped-transcriber.js'
|
|
@ -0,0 +1,41 @@
|
|||
import { join } from 'path'
|
||||
import { $ } from 'execa'
|
||||
import { TranscriptionModel } from '../../transcription-model.js'
|
||||
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
|
||||
import { AbstractTranscriber } from '../../abstract-transcriber.js'
|
||||
import { getFileInfo } from '../../file-utils.js'
|
||||
|
||||
export class OpenaiTranscriber extends AbstractTranscriber {
|
||||
async transcribe (
|
||||
mediaFilePath: string,
|
||||
model: TranscriptionModel = { name: 'tiny' },
|
||||
language: string = 'en',
|
||||
format: TranscriptFormat = 'vtt'
|
||||
): Promise<TranscriptFile> {
|
||||
this.createPerformanceMark()
|
||||
// Shall we run the command with `{ shell: true }` to get the same error as in sh ?
|
||||
// ex: ENOENT => Command not found
|
||||
const $$ = $({ verbose: true })
|
||||
const { baseName } = getFileInfo(mediaFilePath)
|
||||
|
||||
await $$`${this.engine.binary} ${[
|
||||
mediaFilePath,
|
||||
'--model',
|
||||
model?.path || model.name,
|
||||
'--output_format',
|
||||
format,
|
||||
'--output_dir',
|
||||
this.transcriptDirectory,
|
||||
'--language',
|
||||
language
|
||||
]}`
|
||||
|
||||
this.measurePerformanceMark()
|
||||
|
||||
return new TranscriptFile({
|
||||
language,
|
||||
path: join(this.transcriptDirectory, `${baseName}.${format}`),
|
||||
format
|
||||
})
|
||||
}
|
||||
}
|
|
@ -0,0 +1,46 @@
|
|||
import { $ } from 'execa'
|
||||
import assert from 'node:assert'
|
||||
import { join } from 'node:path'
|
||||
import { existsSync } from 'node:fs'
|
||||
import { rename } from 'node:fs/promises'
|
||||
import { TranscriptionModel } from '../../transcription-model.js'
|
||||
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
|
||||
import { getFileInfo } from '../../file-utils.js'
|
||||
import { OpenaiTranscriber } from './openai-transcriber.js'
|
||||
|
||||
export class WhisperTimestampedTranscriber extends OpenaiTranscriber {
|
||||
async transcribe (
|
||||
mediaFilePath: string,
|
||||
model: TranscriptionModel,
|
||||
language: string,
|
||||
format: TranscriptFormat = 'vtt'
|
||||
): Promise<TranscriptFile> {
|
||||
this.createPerformanceMark()
|
||||
|
||||
const $$ = $({ verbose: true })
|
||||
const { baseName, name } = getFileInfo(mediaFilePath)
|
||||
await $$`${this.engine.binary} ${[
|
||||
mediaFilePath,
|
||||
'--model',
|
||||
model?.path || model.name,
|
||||
'--output_format',
|
||||
'all',
|
||||
'--output_dir',
|
||||
this.transcriptDirectory
|
||||
]}`
|
||||
|
||||
const internalTranscriptPath = join(this.transcriptDirectory, `${name}.${format}`)
|
||||
const transcriptPath = join(this.transcriptDirectory, `${baseName}.${format}`)
|
||||
// Whisper timestamped is supposed to output file with the video file extension ex: video.mp4.vtt
|
||||
assert(existsSync(internalTranscriptPath), `${internalTranscriptPath} file doesn't exist.`)
|
||||
await rename(internalTranscriptPath, transcriptPath)
|
||||
|
||||
this.measurePerformanceMark()
|
||||
|
||||
return new TranscriptFile({
|
||||
language,
|
||||
path: transcriptPath,
|
||||
format
|
||||
})
|
||||
}
|
||||
}
|
|
@ -0,0 +1,21 @@
|
|||
import { TranscriptionModel } from '../../transcription-model.js'
|
||||
import { AbstractTranscriber } from '../../abstract-transcriber.js'
|
||||
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
|
||||
|
||||
// Disable local models
|
||||
// env.allowLocalModels = true
|
||||
|
||||
export class TransformersJsTranscriber extends AbstractTranscriber {
|
||||
async transcribe (
|
||||
mediaFilePath: string,
|
||||
model: TranscriptionModel,
|
||||
language: string,
|
||||
format: TranscriptFormat = 'vtt'
|
||||
): Promise<TranscriptFile> {
|
||||
return Promise.resolve(undefined)
|
||||
// return pipeline('automatic-speech-recognition', 'no_attentions', {
|
||||
// // For medium models, we need to load the `no_attentions` revision to avoid running out of memory
|
||||
// revision: [].includes('/whisper-medium') ? 'no_attentions' : 'main'
|
||||
// })
|
||||
}
|
||||
}
|
|
@ -0,0 +1,43 @@
|
|||
import { $ } from 'execa'
import { join } from 'path'

import { AbstractTranscriber } from '../../abstract-transcriber.js'
import { getFileInfo } from '../../file-utils.js'
import { TranscriptFile, TranscriptFormat } from '../../transcript/index.js'
import { TranscriptionModel } from '../../transcription-model.js'
||||
|
||||
export class TransformersTranscriber extends AbstractTranscriber {
|
||||
async transcribe (
|
||||
mediaFilePath: string,
|
||||
model: TranscriptionModel,
|
||||
language: string,
|
||||
format: TranscriptFormat = 'vtt'
|
||||
): Promise<TranscriptFile> {
|
||||
const $$ = $({ verbose: true })
|
||||
// const ffmpegChildProcess = $$`ffmpeg ${[
|
||||
// '-i',
|
||||
// mediaFilePath,
|
||||
// '-vn', // no video
|
||||
// '-ar',
|
||||
// 16000, // set the audio sampling frequency
|
||||
// '-ac',
|
||||
// '1', // set the number of audio channels to 1 since Vosk is expecting mono
|
||||
// '-bufsize',
|
||||
// 1000, // set a buffer size to provide a steady flow of frames
|
||||
// '-'
|
||||
// ]}`
|
||||
|
||||
await $$`transformers-cli ${[
|
||||
'--task',
|
||||
'automatic-speech-recognition',
|
||||
'--model',
|
||||
'openai/whisper-tiny',
|
||||
'--input',
|
||||
mediaFilePath
|
||||
]}`
|
||||
|
||||
return new TranscriptFile({
|
||||
language,
|
||||
path: join(this.transcriptDirectory, `test.${format}`),
|
||||
format
|
||||
})
|
||||
}
|
||||
}
|
|
@ -0,0 +1,13 @@
|
|||
{
|
||||
"extends": "../../tsconfig.base.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "./dist",
|
||||
"rootDir": "src",
|
||||
"tsBuildInfoFile": "./dist/.tsbuildinfo"
|
||||
},
|
||||
"references": [
|
||||
{ "path": "../models" },
|
||||
{ "path": "../core-utils" },
|
||||
{ "path": "../node-utils" }
|
||||
]
|
||||
}
|
|
@ -0,0 +1,10 @@
|
|||
{
|
||||
"extends": "./tsconfig.json",
|
||||
"compilerOptions": {
|
||||
"outDir": "../types-generator/dist/peertube-transcription",
|
||||
"tsBuildInfoFile": "../types-generator/dist/peertube-transcription/.tsbuildinfo",
|
||||
"stripInternal": true,
|
||||
"removeComments": false,
|
||||
"emitDeclarationOnly": true
|
||||
}
|
||||
}
|
|
@ -14,6 +14,7 @@
|
|||
{ "path": "../packages/ffmpeg" },
|
||||
{ "path": "../packages/models" },
|
||||
{ "path": "../packages/node-utils" },
|
||||
{ "path": "../packages/transcription" },
|
||||
{ "path": "../packages/typescript-utils" }
|
||||
],
|
||||
"include": [
|
||||
|
|
|
@ -27,6 +27,7 @@
|
|||
{ "path": "./packages/models" },
|
||||
{ "path": "./packages/node-utils" },
|
||||
{ "path": "./packages/server-commands" },
|
||||
{ "path": "./packages/transcription" },
|
||||
{ "path": "./packages/typescript-utils" }
|
||||
]
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue