PeerTube/server/core/lib/video-captions.ts

import { VideoFileStream } from '@peertube/peertube-models'
import { buildSUUID } from '@peertube/peertube-node-utils'
import { AbstractTranscriber, TranscriptionModel, WhisperBuiltinModel, transcriberFactory } from '@peertube/peertube-transcription'
import { moveAndProcessCaptionFile } from '@server/helpers/captions-utils.js'
import { isVideoCaptionLanguageValid } from '@server/helpers/custom-validators/video-captions.js'
import { logger, loggerTagsFactory } from '@server/helpers/logger.js'
import { CONFIG } from '@server/initializers/config.js'
import { DIRECTORIES } from '@server/initializers/constants.js'
import { sequelizeTypescript } from '@server/initializers/database.js'
import { VideoCaptionModel } from '@server/models/video/video-caption.js'
import { VideoJobInfoModel } from '@server/models/video/video-job-info.js'
import { VideoModel } from '@server/models/video/video.js'
import { MVideo, MVideoCaption, MVideoFullLight, MVideoUUID, MVideoUrl } from '@server/types/models/index.js'
import { MutexInterface } from 'async-mutex'
import { ensureDir, remove } from 'fs-extra/esm'
import { join } from 'path'
import { federateVideoIfNeeded } from './activitypub/videos/federate.js'
import { JobQueue } from './job-queue/job-queue.js'
import { Notifier } from './notifier/notifier.js'
import { TranscriptionJobHandler } from './runners/index.js'
import { VideoPathManager } from './video-path-manager.js'
import { retryTransactionWrapper } from '@server/helpers/database-utils.js'

const lTags = loggerTagsFactory('video-caption')

export async function createLocalCaption (options: {
  video: MVideo
  path: string
  language: string
  automaticallyGenerated: boolean
}) {
  const { language, path, video, automaticallyGenerated } = options

  const videoCaption = new VideoCaptionModel({
    videoId: video.id,
    filename: VideoCaptionModel.generateCaptionName(language),
    language,
    automaticallyGenerated
  }) as MVideoCaption

  await moveAndProcessCaptionFile({ path }, videoCaption)

  await retryTransactionWrapper(() => {
    return sequelizeTypescript.transaction(t => {
      return VideoCaptionModel.insertOrReplaceLanguage(videoCaption, t)
    })
  })

  return Object.assign(videoCaption, { Video: video })
}

export async function createTranscriptionTaskIfNeeded (video: MVideoUUID & MVideoUrl) {
  if (CONFIG.VIDEO_TRANSCRIPTION.ENABLED !== true) return

  logger.info(`Creating transcription job for ${video.url}`, lTags(video.uuid))

  if (CONFIG.VIDEO_TRANSCRIPTION.REMOTE_RUNNERS.ENABLED === true) {
    await new TranscriptionJobHandler().create({ video })
  } else {
    await JobQueue.Instance.createJob({ type: 'video-transcription', payload: { videoUUID: video.uuid } })
  }

  await VideoJobInfoModel.increaseOrCreate(video.uuid, 'pendingTranscription')
}

// ---------------------------------------------------------------------------
// Transcription task
// ---------------------------------------------------------------------------

let transcriber: AbstractTranscriber

export async function generateSubtitle (options: {
  video: MVideoUUID
}) {
  const outputPath = join(CONFIG.STORAGE.TMP_DIR, 'transcription', buildSUUID())

  let inputFileMutexReleaser: MutexInterface.Releaser

  try {
    await ensureDir(outputPath)

    const binDirectory = join(DIRECTORIES.LOCAL_PIP_DIRECTORY, 'bin')

    // Lazy load the transcriber
    if (!transcriber) {
      transcriber = transcriberFactory.createFromEngineName({
        engineName: CONFIG.VIDEO_TRANSCRIPTION.ENGINE,
        enginePath: CONFIG.VIDEO_TRANSCRIPTION.ENGINE_PATH,
        logger,
        binDirectory
      })

      if (!CONFIG.VIDEO_TRANSCRIPTION.ENGINE_PATH) {
        logger.info(`Installing transcriber ${transcriber.engine.name} to generate subtitles`, lTags())
        await transcriber.install(DIRECTORIES.LOCAL_PIP_DIRECTORY)
      }
    }

    inputFileMutexReleaser = await VideoPathManager.Instance.lockFiles(options.video.uuid)

    const video = await VideoModel.loadFull(options.video.uuid)
    if (!video) {
      logger.info('Do not process transcription, video does not exist anymore.', lTags(options.video.uuid))
      return undefined
    }

    const file = video.getMaxQualityFile(VideoFileStream.AUDIO)

    if (!file) {
      logger.info(
        `Do not run transcription for ${video.uuid} in ${outputPath} because it does not contain an audio stream`,
        { video, ...lTags(video.uuid) }
      )

      return
    }

    await VideoPathManager.Instance.makeAvailableVideoFile(file, async inputPath => {
      // Release input file mutex now we are going to run the command
      setTimeout(() => inputFileMutexReleaser(), 1000)

      logger.info(`Running transcription for ${video.uuid} in ${outputPath}`, lTags(video.uuid))

      const transcriptFile = await transcriber.transcribe({
        mediaFilePath: inputPath,

        model: CONFIG.VIDEO_TRANSCRIPTION.MODEL_PATH
          ? await TranscriptionModel.fromPath(CONFIG.VIDEO_TRANSCRIPTION.MODEL_PATH)
          : new WhisperBuiltinModel(CONFIG.VIDEO_TRANSCRIPTION.MODEL),

        transcriptDirectory: outputPath,

        format: 'vtt'
      })

      await onTranscriptionEnded({ video, language: transcriptFile.language, vttPath: transcriptFile.path })
    })
  } finally {
    if (outputPath) await remove(outputPath)
    if (inputFileMutexReleaser) inputFileMutexReleaser()

    VideoJobInfoModel.decrease(options.video.uuid, 'pendingTranscription')
      .catch(err => logger.error('Cannot decrease pendingTranscription job count', { err, ...lTags(options.video.uuid) }))
  }
}

export async function onTranscriptionEnded (options: {
  video: MVideoFullLight
  language: string
  vttPath: string
  lTags?: (string | number)[]
}) {
  const { video, language, vttPath, lTags: customLTags = [] } = options

  if (!isVideoCaptionLanguageValid(language)) {
    logger.warn(`Invalid transcription language for video ${video.uuid}`, lTags(video.uuid))
    return
  }

  if (!video.language) {
    video.language = language
    await video.save()
  }

  const existing = await VideoCaptionModel.loadByVideoIdAndLanguage(video.id, language)
  if (existing && !existing.automaticallyGenerated) {
    logger.info(
      // eslint-disable-next-line max-len
      `Do not replace existing caption for video ${video.uuid} after transcription (subtitle may have been added while during the transcription process)`,
      lTags(video.uuid)
    )
    return
  }

  const caption = await createLocalCaption({
    video,
    language,
    path: vttPath,
    automaticallyGenerated: true
  })

  await sequelizeTypescript.transaction(async t => {
    await federateVideoIfNeeded(video, false, t)
  })

  Notifier.Instance.notifyOfGeneratedVideoTranscription(caption)

  logger.info(`Transcription ended for ${video.uuid}`, lTags(video.uuid, ...customLTags))
}
Separate HLS audio and video streams Allows: * The HLS player to propose an "Audio only" resolution * The live to output an "Audio only" resolution * The live to ingest and output an "Audio only" stream This feature is under a config for VOD videos and is enabled by default for lives In the future we can imagine: * To propose multiple audio streams for a specific video * To ingest an audio only VOD and just output an audio only "video" (the player would play the audio file and PeerTube would not generate additional resolutions) This commit introduce a new way to download videos: * Add "/download/videos/generate/:videoId" endpoint where PeerTube can mux an audio only and a video only file to a mp4 container * The download client modal introduces a new default panel where the user can choose resolutions it wants to download 2024-07-23 16:38:51 +02:00			`import { VideoFileStream } from '@peertube/peertube-models'`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`import { buildSUUID } from '@peertube/peertube-node-utils'`
			`import { AbstractTranscriber, TranscriptionModel, WhisperBuiltinModel, transcriberFactory } from '@peertube/peertube-transcription'`
Refactor caption creation 2024-02-14 09:21:53 +01:00			`import { moveAndProcessCaptionFile } from '@server/helpers/captions-utils.js'`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`import { isVideoCaptionLanguageValid } from '@server/helpers/custom-validators/video-captions.js'`
			`import { logger, loggerTagsFactory } from '@server/helpers/logger.js'`
			`import { CONFIG } from '@server/initializers/config.js'`
			`import { DIRECTORIES } from '@server/initializers/constants.js'`
Refactor caption creation 2024-02-14 09:21:53 +01:00			`import { sequelizeTypescript } from '@server/initializers/database.js'`
			`import { VideoCaptionModel } from '@server/models/video/video-caption.js'`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`import { VideoJobInfoModel } from '@server/models/video/video-job-info.js'`
			`import { VideoModel } from '@server/models/video/video.js'`
			`import { MVideo, MVideoCaption, MVideoFullLight, MVideoUUID, MVideoUrl } from '@server/types/models/index.js'`
Run transcription after studio 2024-07-11 11:29:46 +02:00			`import { MutexInterface } from 'async-mutex'`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`import { ensureDir, remove } from 'fs-extra/esm'`
			`import { join } from 'path'`
			`import { federateVideoIfNeeded } from './activitypub/videos/federate.js'`
			`import { JobQueue } from './job-queue/job-queue.js'`
			`import { Notifier } from './notifier/notifier.js'`
			`import { TranscriptionJobHandler } from './runners/index.js'`
			`import { VideoPathManager } from './video-path-manager.js'`
More robust caption update Avoid file not found when the transaction is retried 2024-09-12 10:52:18 +02:00			`import { retryTransactionWrapper } from '@server/helpers/database-utils.js'`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00
			`const lTags = loggerTagsFactory('video-caption')`
Refactor caption creation 2024-02-14 09:21:53 +01:00
			`export async function createLocalCaption (options: {`
			`video: MVideo`
			`path: string`
			`language: string`
Metadata to know if the caption is auto generated 2024-06-27 15:29:26 +02:00			`automaticallyGenerated: boolean`
Refactor caption creation 2024-02-14 09:21:53 +01:00			`}) {`
Metadata to know if the caption is auto generated 2024-06-27 15:29:26 +02:00			`const { language, path, video, automaticallyGenerated } = options`
Refactor caption creation 2024-02-14 09:21:53 +01:00
			`const videoCaption = new VideoCaptionModel({`
			`videoId: video.id,`
			`filename: VideoCaptionModel.generateCaptionName(language),`
Metadata to know if the caption is auto generated 2024-06-27 15:29:26 +02:00			`language,`
			`automaticallyGenerated`
Refactor caption creation 2024-02-14 09:21:53 +01:00			`}) as MVideoCaption`

			`await moveAndProcessCaptionFile({ path }, videoCaption)`

More robust caption update Avoid file not found when the transaction is retried 2024-09-12 10:52:18 +02:00			`await retryTransactionWrapper(() => {`
			`return sequelizeTypescript.transaction(t => {`
			`return VideoCaptionModel.insertOrReplaceLanguage(videoCaption, t)`
			`})`
Refactor caption creation 2024-02-14 09:21:53 +01:00			`})`

Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`return Object.assign(videoCaption, { Video: video })`
			`}`

			`export async function createTranscriptionTaskIfNeeded (video: MVideoUUID & MVideoUrl) {`
			`if (CONFIG.VIDEO_TRANSCRIPTION.ENABLED !== true) return`

			logger.info(`Creating transcription job for ${video.url}`, lTags(video.uuid))

			`if (CONFIG.VIDEO_TRANSCRIPTION.REMOTE_RUNNERS.ENABLED === true) {`
			`await new TranscriptionJobHandler().create({ video })`
			`} else {`
			`await JobQueue.Instance.createJob({ type: 'video-transcription', payload: { videoUUID: video.uuid } })`
			`}`

			`await VideoJobInfoModel.increaseOrCreate(video.uuid, 'pendingTranscription')`
			`}`

			`// ---------------------------------------------------------------------------`
			`// Transcription task`
			`// ---------------------------------------------------------------------------`

			`let transcriber: AbstractTranscriber`

			`export async function generateSubtitle (options: {`
			`video: MVideoUUID`
			`}) {`
			`const outputPath = join(CONFIG.STORAGE.TMP_DIR, 'transcription', buildSUUID())`

Don't block video files on transcription 2024-07-04 11:18:04 +02:00			`let inputFileMutexReleaser: MutexInterface.Releaser`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00
			`try {`
Don't block video files on transcription 2024-07-04 11:18:04 +02:00			`await ensureDir(outputPath)`

			`const binDirectory = join(DIRECTORIES.LOCAL_PIP_DIRECTORY, 'bin')`

Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`// Lazy load the transcriber`
			`if (!transcriber) {`
			`transcriber = transcriberFactory.createFromEngineName({`
			`engineName: CONFIG.VIDEO_TRANSCRIPTION.ENGINE,`
			`enginePath: CONFIG.VIDEO_TRANSCRIPTION.ENGINE_PATH,`
			`logger,`
			`binDirectory`
			`})`

			`if (!CONFIG.VIDEO_TRANSCRIPTION.ENGINE_PATH) {`
			logger.info(`Installing transcriber ${transcriber.engine.name} to generate subtitles`, lTags())
			`await transcriber.install(DIRECTORIES.LOCAL_PIP_DIRECTORY)`
			`}`
			`}`

Don't block video files on transcription 2024-07-04 11:18:04 +02:00			`inputFileMutexReleaser = await VideoPathManager.Instance.lockFiles(options.video.uuid)`

Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`const video = await VideoModel.loadFull(options.video.uuid)`
Separate HLS audio and video streams Allows: * The HLS player to propose an "Audio only" resolution * The live to output an "Audio only" resolution * The live to ingest and output an "Audio only" stream This feature is under a config for VOD videos and is enabled by default for lives In the future we can imagine: * To propose multiple audio streams for a specific video * To ingest an audio only VOD and just output an audio only "video" (the player would play the audio file and PeerTube would not generate additional resolutions) This commit introduce a new way to download videos: * Add "/download/videos/generate/:videoId" endpoint where PeerTube can mux an audio only and a video only file to a mp4 container * The download client modal introduces a new default panel where the user can choose resolutions it wants to download 2024-07-23 16:38:51 +02:00			`if (!video) {`
			`logger.info('Do not process transcription, video does not exist anymore.', lTags(options.video.uuid))`
			`return undefined`
			`}`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00
Separate HLS audio and video streams Allows: * The HLS player to propose an "Audio only" resolution * The live to output an "Audio only" resolution * The live to ingest and output an "Audio only" stream This feature is under a config for VOD videos and is enabled by default for lives In the future we can imagine: * To propose multiple audio streams for a specific video * To ingest an audio only VOD and just output an audio only "video" (the player would play the audio file and PeerTube would not generate additional resolutions) This commit introduce a new way to download videos: * Add "/download/videos/generate/:videoId" endpoint where PeerTube can mux an audio only and a video only file to a mp4 container * The download client modal introduces a new default panel where the user can choose resolutions it wants to download 2024-07-23 16:38:51 +02:00			`const file = video.getMaxQualityFile(VideoFileStream.AUDIO)`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00
Separate HLS audio and video streams Allows: * The HLS player to propose an "Audio only" resolution * The live to output an "Audio only" resolution * The live to ingest and output an "Audio only" stream This feature is under a config for VOD videos and is enabled by default for lives In the future we can imagine: * To propose multiple audio streams for a specific video * To ingest an audio only VOD and just output an audio only "video" (the player would play the audio file and PeerTube would not generate additional resolutions) This commit introduce a new way to download videos: * Add "/download/videos/generate/:videoId" endpoint where PeerTube can mux an audio only and a video only file to a mp4 container * The download client modal introduces a new default panel where the user can choose resolutions it wants to download 2024-07-23 16:38:51 +02:00			`if (!file) {`
			`logger.info(`
			`Do not run transcription for ${video.uuid} in ${outputPath} because it does not contain an audio stream`,
			`{ video, ...lTags(video.uuid) }`
			`)`

			`return`
			`}`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00
Separate HLS audio and video streams Allows: * The HLS player to propose an "Audio only" resolution * The live to output an "Audio only" resolution * The live to ingest and output an "Audio only" stream This feature is under a config for VOD videos and is enabled by default for lives In the future we can imagine: * To propose multiple audio streams for a specific video * To ingest an audio only VOD and just output an audio only "video" (the player would play the audio file and PeerTube would not generate additional resolutions) This commit introduce a new way to download videos: * Add "/download/videos/generate/:videoId" endpoint where PeerTube can mux an audio only and a video only file to a mp4 container * The download client modal introduces a new default panel where the user can choose resolutions it wants to download 2024-07-23 16:38:51 +02:00			`await VideoPathManager.Instance.makeAvailableVideoFile(file, async inputPath => {`
Don't block video files on transcription 2024-07-04 11:18:04 +02:00			`// Release input file mutex now we are going to run the command`
			`setTimeout(() => inputFileMutexReleaser(), 1000)`

Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			logger.info(`Running transcription for ${video.uuid} in ${outputPath}`, lTags(video.uuid))

			`const transcriptFile = await transcriber.transcribe({`
Separate HLS audio and video streams Allows: * The HLS player to propose an "Audio only" resolution * The live to output an "Audio only" resolution * The live to ingest and output an "Audio only" stream This feature is under a config for VOD videos and is enabled by default for lives In the future we can imagine: * To propose multiple audio streams for a specific video * To ingest an audio only VOD and just output an audio only "video" (the player would play the audio file and PeerTube would not generate additional resolutions) This commit introduce a new way to download videos: * Add "/download/videos/generate/:videoId" endpoint where PeerTube can mux an audio only and a video only file to a mp4 container * The download client modal introduces a new default panel where the user can choose resolutions it wants to download 2024-07-23 16:38:51 +02:00			`mediaFilePath: inputPath,`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00
			`model: CONFIG.VIDEO_TRANSCRIPTION.MODEL_PATH`
			`? await TranscriptionModel.fromPath(CONFIG.VIDEO_TRANSCRIPTION.MODEL_PATH)`
			`: new WhisperBuiltinModel(CONFIG.VIDEO_TRANSCRIPTION.MODEL),`

			`transcriptDirectory: outputPath,`

			`format: 'vtt'`
			`})`

			`await onTranscriptionEnded({ video, language: transcriptFile.language, vttPath: transcriptFile.path })`
			`})`
			`} finally {`
			`if (outputPath) await remove(outputPath)`
Don't block video files on transcription 2024-07-04 11:18:04 +02:00			`if (inputFileMutexReleaser) inputFileMutexReleaser()`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00
More robust transcription error job handler 2024-07-01 14:38:19 +02:00			`VideoJobInfoModel.decrease(options.video.uuid, 'pendingTranscription')`
			`.catch(err => logger.error('Cannot decrease pendingTranscription job count', { err, ...lTags(options.video.uuid) }))`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`}`
			`}`

			`export async function onTranscriptionEnded (options: {`
			`video: MVideoFullLight`
			`language: string`
			`vttPath: string`
			`lTags?: (string \| number)[]`
			`}) {`
			`const { video, language, vttPath, lTags: customLTags = [] } = options`

			`if (!isVideoCaptionLanguageValid(language)) {`
Don't create pendingTranscription twice 2024-07-03 08:32:14 +02:00			logger.warn(`Invalid transcription language for video ${video.uuid}`, lTags(video.uuid))
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`return`
			`}`

			`if (!video.language) {`
			`video.language = language`
			`await video.save()`
			`}`

Don't replace caption on transcription 2024-07-12 16:20:26 +02:00			`const existing = await VideoCaptionModel.loadByVideoIdAndLanguage(video.id, language)`
			`if (existing && !existing.automaticallyGenerated) {`
			`logger.info(`
			`// eslint-disable-next-line max-len`
			`Do not replace existing caption for video ${video.uuid} after transcription (subtitle may have been added while during the transcription process)`,
			`lTags(video.uuid)`
			`)`
			`return`
			`}`

Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`const caption = await createLocalCaption({`
			`video,`
			`language,`
Metadata to know if the caption is auto generated 2024-06-27 15:29:26 +02:00			`path: vttPath,`
			`automaticallyGenerated: true`
Integrate transcription in PeerTube 2024-06-13 09:23:12 +02:00			`})`

			`await sequelizeTypescript.transaction(async t => {`
			`await federateVideoIfNeeded(video, false, t)`
			`})`

			`Notifier.Instance.notifyOfGeneratedVideoTranscription(caption)`

			logger.info(`Transcription ended for ${video.uuid}`, lTags(video.uuid, ...customLTags))
Refactor caption creation 2024-02-14 09:21:53 +01:00			`}`