chore: add performance markers

pull/6303/head
lutangar 2024-04-19 18:05:17 +02:00
parent 2e242129b9
commit 5d1d0ab565
8 changed files with 125 additions and 9 deletions

View File

@ -4,7 +4,8 @@ import { expect } from 'chai'
import { existsSync } from 'node:fs'
import { rm, mkdir, readFile } from 'node:fs/promises'
import { buildAbsoluteFixturePath, root } from '@peertube/peertube-node-utils'
import { transcriberFactory } from '@peertube/peertube-transcription'
import { toHumanReadable, transcriberFactory } from '@peertube/peertube-transcription'
import { performance, PerformanceObserver } from 'node:perf_hooks'
describe('Transcribers', function () {
const transcriptDirectory = join(root(), 'test-transcript')
@ -17,6 +18,13 @@ describe('Transcribers', function () {
before(async function () {
  await mkdir(transcriptDirectory, { recursive: true })

  // Print the duration of every `measure` entry reported to the observer
  const measureObserver = new PerformanceObserver((list) => {
    for (const measure of list.getEntries()) {
      console.log(`Transcription ${measure.name} took ${toHumanReadable(measure.duration)}`)
    }
  })

  measureObserver.observe({ type: 'measure' })
})
transcribers.forEach(function (transcriberName) {
@ -55,5 +63,6 @@ describe('Transcribers', function () {
after(async function () {
  await rm(transcriptDirectory, { recursive: true, force: true })

  // Marks and measures both stay in the global performance timeline until they are cleared explicitly
  performance.clearMarks()
  performance.clearMeasures()
})
})

View File

@ -5,6 +5,9 @@ import { TranscriptionEngine } from './transcription-engine.js'
import { TranscriptionModel } from './transcription-model.js'
import { Transcript, TranscriptFormat } from './transcript.js'
import { existsSync } from 'fs'
import { PerformanceObserver } from 'node:perf_hooks'
import short from 'short-uuid'
import assert from 'node:assert'
export abstract class AbstractTranscriber {
public static DEFAULT_TRANSCRIPT_DIRECTORY = join(root(), 'dist', 'transcripts')
@ -12,15 +15,19 @@ export abstract class AbstractTranscriber {
engine: TranscriptionEngine
logger: Logger
transcriptDirectory: string
performanceObserver?: PerformanceObserver
runId?: string
/**
 * Stores the collaborators of the transcriber on the instance.
 *
 * @param engine transcription engine description; its `name` is used to build the run id of performance marks
 * @param logger logger used to report errors raised while measuring performance
 * @param transcriptDirectory directory for transcripts, defaults to `AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY`
 * @param performanceObserver optional observer, only stored on the instance here
 */
constructor (
engine: TranscriptionEngine,
logger: Logger,
transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY
transcriptDirectory: string = AbstractTranscriber.DEFAULT_TRANSCRIPT_DIRECTORY,
performanceObserver?: PerformanceObserver
) {
this.engine = engine
this.logger = logger
this.transcriptDirectory = transcriptDirectory
this.performanceObserver = performanceObserver
}
detectLanguage () {
@ -35,5 +42,44 @@ export abstract class AbstractTranscriber {
return model.format === 'PyTorch'
}
/**
 * Begins timing a transcription run: assigns a new run id made of a short uuid
 * followed by the engine name, then records the start mark for that run.
 */
createPerformanceMark () {
  const uniquePrefix = short.uuid()
  this.runId = `${uniquePrefix}-${this.engine.name}`

  const startMarkName = this.getStartPerformanceMarkName()
  performance.mark(startMarkName)
}
/**
 * Ends the timing of the current run: records the end mark and creates a measure,
 * named after the run id, spanning from the start mark to the end mark.
 *
 * Any failure (for instance a missing run id) is logged at `error` level instead of being thrown,
 * so measuring never makes a transcription fail.
 *
 * The marks of the run are removed from the global performance timeline afterwards:
 * they are only needed to build the measure and would otherwise pile up with every run.
 */
measurePerformanceMark () {
  // Marks owned by this run, released once the measure has been attempted
  const ownedMarkNames: string[] = []

  try {
    const runId = this.runId
    // Narrows `runId` to a string for `performance.measure`, same check as the mark name getters
    assert(runId, 'Each transcription run should have an id.')

    const startMarkName = this.getStartPerformanceMarkName()
    const endMarkName = this.getEndPerformanceMarkName()
    ownedMarkNames.push(startMarkName, endMarkName)

    performance.mark(endMarkName)
    performance.measure(runId, startMarkName, endMarkName)
  } catch (e) {
    this.logger.log({ level: 'error', message: e })
  } finally {
    for (const markName of ownedMarkNames) {
      performance.clearMarks(markName)
    }
  }
}
/**
 * Name of the performance mark recorded when the current run starts.
 *
 * @returns the run id suffixed with `-started`
 * @throws AssertionError when no run id has been assigned yet
 */
getStartPerformanceMarkName () {
  const hasRunId = Boolean(this.runId)
  assert(hasRunId, 'Each transcription run should have an id.')

  return this.runId + '-started'
}
/**
 * Name of the performance mark recorded when the current run ends.
 *
 * @returns the run id suffixed with `-ended`
 * @throws AssertionError when no run id has been assigned yet
 */
getEndPerformanceMarkName () {
  const hasRunId = Boolean(this.runId)
  assert(hasRunId, 'Each transcription run should have an id.')

  return this.runId + '-ended'
}
/**
 * Placeholder: the body consists solely of commented-out code, so calling this method does nothing.
 * The disabled snippet sketches a PerformanceObserver callback that would log the duration
 * of each entry and then clear the performance marks.
 */
perf () {
// const transcriptionPerformanceObserver = new PerformanceObserver((items) => {
// items
// .getEntries()
// .forEach((entry) => logger.debug(`Transcription n°${entry.name} took ${toHumanReadable(entry.duration)}`, entry))
// performance.clearMarks()
// })
}
/**
 * Transcribes a media file; every concrete transcriber provides its own implementation.
 *
 * @param mediaFilePath path of the media file to transcribe
 * @param model transcription model to use
 * @param language language of the transcription (presumably the spoken language of the media, to be confirmed against the implementations)
 * @param format format of the transcript to produce
 * @returns a promise resolving to the resulting transcript
 */
abstract transcribe (mediaFilePath: string, model: TranscriptionModel, language: string, format: TranscriptFormat): Promise<Transcript>
}

View File

@ -0,0 +1,22 @@
import { toHumanReadable, toTimecode } from './duration';
// Checks the duration formatting helpers against a few known values
describe('duration', () => {
  test('toHumanReadable', async () => {
    const oneMinuteInMs = 60000;
    const oneHourAndOneMinuteInMs = oneMinuteInMs * 60 + oneMinuteInMs;

    // Units whose amount is zero are left out of the result
    expect(toHumanReadable(oneMinuteInMs)).toEqual('1m');
    expect(toHumanReadable(oneHourAndOneMinuteInMs)).toEqual('1h 1m');
  });

  test('toTimecode', async () => {
    const roughlyOneMinuteInSeconds = '60.41545';
    const oneHourInSeconds = '3600';

    // The fractional part of the seconds is dropped
    expect(toTimecode(roughlyOneMinuteInSeconds)).toEqual('00:01:00');
    expect(toTimecode(oneHourInSeconds)).toEqual('01:00:00');
  });
});

View File

@ -0,0 +1,35 @@
/**
 * One component of a duration: an amount paired with the unit suffix it is rendered with.
 */
export interface DurationDescriptor {
// Amount expressed in `unit`
duration: number
// Suffix appended to the amount when rendered, e.g. 'h', 'm' or 's'
unit: string
}
/**
 * Formats a duration given in milliseconds as a short human readable string, e.g. `1h 1m` or `42s`.
 *
 * Units whose amount is zero are omitted. Hours are not capped, so a duration of a day or more
 * keeps counting in hours (`25h`) instead of wrapping around. Durations shorter than one second
 * are rendered in milliseconds (`250ms`) rather than as an empty string.
 *
 * @param ms duration in milliseconds; the fractional part is ignored and negative values are treated as zero
 * @returns the formatted duration, or an empty string when `ms` is not a finite number
 */
export function toHumanReadable (ms: number): string {
  if (!Number.isFinite(ms)) return ''

  const totalMs = Math.max(0, Math.trunc(ms))
  const totalSeconds = Math.floor(totalMs / 1000)

  // Nothing to express in seconds or above: fall back to milliseconds so the result is never empty
  if (totalSeconds === 0) return `${totalMs}ms`

  const parts: [ number, string ][] = [
    [ Math.floor(totalSeconds / 3600), 'h' ],
    [ Math.floor((totalSeconds % 3600) / 60), 'm' ],
    [ totalSeconds % 60, 's' ]
  ]

  return parts
    .filter(([ amount ]) => amount > 0)
    .map(([ amount, unit ]) => `${amount}${unit}`)
    .join(' ')
}
/**
 * Renders a duration component as its amount immediately followed by its unit.
 *
 * @returns the rendered component, or an empty string when the amount is not strictly positive
 */
export function toWords ({ duration, unit }: DurationDescriptor) {
  if (duration > 0) {
    return `${duration}${unit}`
  }

  return ''
}
/**
 * Formats a number of seconds as an `HH:MM:SS` timecode.
 *
 * The breakdown is plain arithmetic, so it does not depend on the local time zone
 * and hours keep counting past 24 (`25:00:00`) instead of wrapping around.
 *
 * @param s seconds, as a number or a numeric string; the fractional part is dropped
 * @returns the timecode; input that is unparsable, non finite or negative yields `00:00:00`
 */
export function toTimecode (s: number | string) {
  const parsedSeconds = parseFloat(s.toString())
  const totalSeconds = Number.isFinite(parsedSeconds) && parsedSeconds > 0
    ? Math.floor(parsedSeconds)
    : 0

  const hours = Math.floor(totalSeconds / 3600)
  const minutes = Math.floor((totalSeconds % 3600) / 60)
  const seconds = totalSeconds % 60

  return `${padLeft(hours)}:${padLeft(minutes)}:${padLeft(seconds)}`
}

/**
 * Left-pads the decimal representation of `value` with zeros up to `length` characters.
 */
function padLeft (value: number, length = 2): string {
  return value.toString().padStart(length, '0')
}

View File

@ -1,6 +1,8 @@
import { TranscriberFactory } from './transcriber-factory.js'
import { engines } from './whisper/index.js'
export * from './duration.js'
export * from './transcription-engine.js'
export * from './transcription-model.js'
export * from './transcript.js'

View File

@ -12,6 +12,7 @@ export class Ctranslate2Transcriber extends AbstractTranscriber {
language: string,
format: TranscriptFormat = 'vtt'
): Promise<Transcript> {
this.createPerformanceMark()
const $$ = $({ verbose: true })
const { baseName } = getFileInfo(mediaFilePath)
@ -25,7 +26,7 @@ export class Ctranslate2Transcriber extends AbstractTranscriber {
this.transcriptDirectory
]}`
await $$`ls ${this.transcriptDirectory}`
this.measurePerformanceMark()
return {
language,

View File

@ -12,12 +12,13 @@ export class OpenaiTranscriber extends AbstractTranscriber {
language: string,
format: TranscriptFormat = 'vtt'
): Promise<Transcript> {
this.createPerformanceMark()
// Shall we run the command with `{ shell: true }` to get the same error as in sh ?
// ex: ENOENT => Command not found
const $$ = $({ verbose: true })
const { baseName } = getFileInfo(mediaFilePath)
const { stdout } = await $$`whisper ${[
await $$`whisper ${[
mediaFilePath,
'--model',
model.name,
@ -26,10 +27,8 @@ export class OpenaiTranscriber extends AbstractTranscriber {
'--output_dir',
this.transcriptDirectory
]}`
console.log(stdout)
const { stdout: lsStdout } = await $$`ls ${this.transcriptDirectory}`
console.log(lsStdout)
this.measurePerformanceMark()
return {
language,

View File

@ -15,6 +15,8 @@ export class WhisperTimestampedTranscriber extends AbstractTranscriber {
language: string,
format: TranscriptFormat = 'vtt'
): Promise<Transcript> {
this.createPerformanceMark()
const $$ = $({ verbose: true })
const { baseName, name } = getFileInfo(mediaFilePath)
await $$`whisper_timestamped ${[
@ -30,9 +32,9 @@ export class WhisperTimestampedTranscriber extends AbstractTranscriber {
const internalTranscriptPath = join(this.transcriptDirectory, `${name}.${format}`)
const transcriptPath = join(this.transcriptDirectory, `${baseName}.${format}`)
assert(existsSync(internalTranscriptPath), '')
await rename(internalTranscriptPath, transcriptPath)
await $$`ls ${this.transcriptDirectory}`
this.measurePerformanceMark()
return {
language,