2025 AI & Machine Learning
WWDC25 · 19 min · AI & Machine Learning
Bring advanced speech-to-text to your app with SpeechAnalyzer
Discover the new SpeechAnalyzer API for speech to text. We’ll learn about the Swift API and its capabilities, which power features in Notes, Voice Memos, Journal, and more. We’ll dive into details about how speech to text works and how SpeechAnalyzer and SpeechTranscriber can enable you to create exciting, performant features. And you’ll learn how to incorporate SpeechAnalyzer and live transcription into your app with a code-along.
Watch at developer.apple.com ↗
Chapters
Code shown on screen · 16 snippets
Transcribe a file
// Build the transcriber and start folding its asynchronous results into a single string.
let transcriber = SpeechTranscriber(locale: locale, preset: .offlineTranscription)
async let fullTranscript = try transcriber.results
    .reduce("") { partial, next in partial + next.text }

// Analyze the file. When a last sample is reported, finalize through it;
// otherwise cancel and finish right away.
let analyzer = SpeechAnalyzer(modules: [transcriber])
let lastSample = try await analyzer.analyzeSequence(from: file)
if let lastSample {
    try await analyzer.finalizeAndFinish(through: lastSample)
} else {
    await analyzer.cancelAndFinishNow()
}

// Wait for the concatenated transcription.
return try await fullTranscript
Speech Transcriber setup (volatile results + timestamps)
/// Creates the speech transcriber for the current locale, requesting volatile
/// results and audio time ranges on the transcribed text.
func setUpTranscriber() async throws {
    let currentLocale = Locale.current
    transcriber = SpeechTranscriber(
        locale: currentLocale,
        transcriptionOptions: [],
        reportingOptions: [.volatileResults],
        attributeOptions: [.audioTimeRange]
    )
}
Speech Transcriber setup (volatile results, no timestamps)
// transcriber = SpeechTranscriber(locale: Locale.current, preset: .progressiveLiveTranscription)
Set up SpeechAnalyzer
/// Creates the speech transcriber and the analyzer that hosts it.
///
/// - Throws: `TranscriptionError.failedToSetupRecognitionStream` when no
///   transcriber is available after assignment.
func setUpTranscriber() async throws {
    transcriber = SpeechTranscriber(
        locale: Locale.current,
        transcriptionOptions: [],
        reportingOptions: [.volatileResults],
        attributeOptions: [.audioTimeRange]
    )

    guard let activeTranscriber = transcriber else {
        throw TranscriptionError.failedToSetupRecognitionStream
    }

    // The analyzer runs the transcriber as its only module.
    analyzer = SpeechAnalyzer(modules: [activeTranscriber])
}
Get audio format
/// Creates the transcriber and analyzer, then stores the best available audio
/// format compatible with the transcriber.
///
/// - Throws: `TranscriptionError.failedToSetupRecognitionStream` when no
///   transcriber is available after assignment.
func setUpTranscriber() async throws {
    let currentLocale = Locale.current
    transcriber = SpeechTranscriber(
        locale: currentLocale,
        transcriptionOptions: [],
        reportingOptions: [.volatileResults],
        attributeOptions: [.audioTimeRange]
    )

    guard let activeTranscriber = transcriber else {
        throw TranscriptionError.failedToSetupRecognitionStream
    }

    let modules = [activeTranscriber]
    analyzer = SpeechAnalyzer(modules: modules)
    self.analyzerFormat = await SpeechAnalyzer.bestAvailableAudioFormat(compatibleWith: modules)
}
Ensure models
/// Creates the transcriber and analyzer, makes sure the speech model for the
/// current locale is available, and stores the best available audio format.
///
/// - Throws: `TranscriptionError.failedToSetupRecognitionStream` when no
///   transcriber is available after assignment, or any error thrown by
///   `ensureModel(transcriber:locale:)`.
func setUpTranscriber() async throws {
    transcriber = SpeechTranscriber(locale: Locale.current,
                                    transcriptionOptions: [],
                                    reportingOptions: [.volatileResults],
                                    attributeOptions: [.audioTimeRange])
    guard let transcriber else {
        throw TranscriptionError.failedToSetupRecognitionStream
    }
    analyzer = SpeechAnalyzer(modules: [transcriber])

    do {
        try await ensureModel(transcriber: transcriber, locale: Locale.current)
    } catch let error as TranscriptionError {
        // Log, then propagate. Returning normally here would tell the caller
        // that setup succeeded even though the model check failed, while every
        // other error type already propagates out of this function.
        print(error)
        throw error
    }

    // NOTE(review): the format is queried after the model check because the
    // framework documentation describes it as derived from installed assets —
    // confirm against the SpeechAnalyzer reference.
    self.analyzerFormat = await SpeechAnalyzer.bestAvailableAudioFormat(compatibleWith: [transcriber])
}
Finish SpeechAnalyzer setup
/// Performs the full transcription setup: creates the transcriber and analyzer,
/// makes sure the speech model for the current locale is available, stores the
/// best available audio format, and starts the analyzer on a fresh input stream.
///
/// - Throws: `TranscriptionError.failedToSetupRecognitionStream` when no
///   transcriber is available after assignment, any error thrown by
///   `ensureModel(transcriber:locale:)`, or any error thrown while starting
///   the analyzer.
func setUpTranscriber() async throws {
    transcriber = SpeechTranscriber(locale: Locale.current,
                                    transcriptionOptions: [],
                                    reportingOptions: [.volatileResults],
                                    attributeOptions: [.audioTimeRange])
    guard let transcriber else {
        throw TranscriptionError.failedToSetupRecognitionStream
    }
    analyzer = SpeechAnalyzer(modules: [transcriber])

    do {
        try await ensureModel(transcriber: transcriber, locale: Locale.current)
    } catch let error as TranscriptionError {
        // Log, then propagate. A silent return would leave the analyzer
        // unstarted and the input builder unset while the caller believes
        // setup succeeded, so the failure would surface later as an
        // unrelated error when audio is streamed.
        print(error)
        throw error
    }

    // NOTE(review): the format is queried after the model check because the
    // framework documentation describes it as derived from installed assets —
    // confirm against the SpeechAnalyzer reference.
    self.analyzerFormat = await SpeechAnalyzer.bestAvailableAudioFormat(compatibleWith: [transcriber])

    // Create the stream the analyzer reads from and the builder used to feed it.
    (inputSequence, inputBuilder) = AsyncStream<AnalyzerInput>.makeStream()
    guard let inputSequence else { return }
    try await analyzer?.start(inputSequence: inputSequence)
}
Check for language support
/// Verifies that `locale` is supported for speech transcription.
///
/// - Parameters:
///   - transcriber: The transcriber the model is needed for (not read here).
///   - locale: The locale to check.
/// - Throws: `TranscriptionError.localeNotSupported` when the locale is not supported.
public func ensureModel(transcriber: SpeechTranscriber, locale: Locale) async throws {
    let isSupported = await supported(locale: locale)
    if !isSupported {
        throw TranscriptionError.localeNotSupported
    }
}
/// Returns whether `locale` matches one of `SpeechTranscriber.supportedLocales`,
/// comparing BCP 47 identifiers.
func supported(locale: Locale) async -> Bool {
    let candidates = await SpeechTranscriber.supportedLocales
    let wanted = locale.identifier(.bcp47)
    return candidates.contains { $0.identifier(.bcp47) == wanted }
}
/// Returns whether `locale` matches one of `SpeechTranscriber.installedLocales`,
/// comparing BCP 47 identifiers.
func installed(locale: Locale) async -> Bool {
    let installedLocales = await Set(SpeechTranscriber.installedLocales)
    let wanted = locale.identifier(.bcp47)
    return installedLocales.contains { $0.identifier(.bcp47) == wanted }
}
Check for model installation
/// Makes sure a speech model for `locale` is usable: checks support, then
/// downloads the model when it is not already installed.
///
/// - Parameters:
///   - transcriber: The transcriber passed to `downloadIfNeeded(for:)`.
///   - locale: The locale to check.
/// - Throws: `TranscriptionError.localeNotSupported` when the locale is not
///   supported, or any error thrown by `downloadIfNeeded(for:)`.
public func ensureModel(transcriber: SpeechTranscriber, locale: Locale) async throws {
    let isSupported = await supported(locale: locale)
    if !isSupported {
        throw TranscriptionError.localeNotSupported
    }

    // Nothing to do when the locale is already installed.
    let alreadyInstalled = await installed(locale: locale)
    guard !alreadyInstalled else { return }

    try await downloadIfNeeded(for: transcriber)
}
/// Reports whether the BCP 47 identifier of `locale` appears among the
/// identifiers of `SpeechTranscriber.supportedLocales`.
func supported(locale: Locale) async -> Bool {
    let supportedLocales = await SpeechTranscriber.supportedLocales
    let target = locale.identifier(.bcp47)
    for candidate in supportedLocales where candidate.identifier(.bcp47) == target {
        return true
    }
    return false
}
/// Reports whether the BCP 47 identifier of `locale` appears among the
/// identifiers of `SpeechTranscriber.installedLocales`.
func installed(locale: Locale) async -> Bool {
    let installedLocales = await Set(SpeechTranscriber.installedLocales)
    let target = locale.identifier(.bcp47)
    for candidate in installedLocales where candidate.identifier(.bcp47) == target {
        return true
    }
    return false
}
Download the model
/// Requests installation of the assets that support `module` and, when a
/// request is returned, publishes its progress and downloads and installs them.
///
/// - Parameter module: The transcriber whose assets should be installed.
/// - Throws: Any error from creating the request or from the download/install step.
func downloadIfNeeded(for module: SpeechTranscriber) async throws {
    // No request means there is nothing to download.
    guard let request = try await AssetInventory.assetInstallationRequest(supporting: [module]) else {
        return
    }
    self.downloadProgress = request.progress
    try await request.downloadAndInstall()
}
Deallocate an asset
/// Deallocates every locale currently listed in `AssetInventory.allocatedLocales`.
func deallocate() async {
    for allocatedLocale in await AssetInventory.allocatedLocales {
        await AssetInventory.deallocate(locale: allocatedLocale)
    }
}
Speech result handling
// Consume transcription results as they arrive. Final results are appended to
// the finalized transcript and clear the volatile one; non-final results
// replace the volatile transcript and are tinted so they read as provisional.
recognizerTask = Task {
    do {
        for try await result in transcriber.results {
            let text = result.text
            if result.isFinal {
                finalizedTranscript += text
                volatileTranscript = ""
                updateStoryWithNewText(withFinal: text)
                print(text.audioTimeRange)
            } else {
                volatileTranscript = text
                volatileTranscript.foregroundColor = .purple.opacity(0.4)
            }
        }
    } catch {
        // Include the underlying error so the failure can be diagnosed.
        print("speech recognition failed: \(error)")
    }
}
Set up audio recording
/// Starts recording: stores the recording URL on the story, checks
/// authorization, configures the audio session on iOS, sets up the
/// transcriber, and forwards every captured audio buffer to it.
///
/// Returns without recording when authorization is denied.
/// - Throws: Errors from audio session setup, transcriber setup, starting the
///   audio stream, or streaming a buffer to the transcriber.
func record() async throws {
    self.story.url.wrappedValue = url

    let authorized = await isAuthorized()
    if !authorized {
        print("user denied mic permission")
        return
    }

#if os(iOS)
    try setUpAudioSession()
#endif

    try await transcriber.setUpTranscriber()

    let buffers = try await audioStream()
    for await buffer in buffers {
        try await self.transcriber.streamAudioToTranscriber(buffer)
    }
}
Set up audio recording via AVAudioEngine
#if os(iOS)
/// Configures the shared audio session with the play-and-record category in
/// spoken-audio mode, then activates it.
///
/// - Throws: Any error from setting the category or activating the session.
func setUpAudioSession() throws {
    let session = AVAudioSession.sharedInstance()
    try session.setCategory(.playAndRecord, mode: .spokenAudio)
    try session.setActive(true, options: .notifyOthersOnDeactivation)
}
#endif
/// Starts the audio engine and returns a stream of the buffers captured from
/// its input node. Each captured buffer is also written to disk.
///
/// - Returns: An unbounded-buffering stream of captured audio buffers.
/// - Throws: Any error from `setupAudioEngine()` or from starting the engine.
private func audioStream() async throws -> AsyncStream<AVAudioPCMBuffer> {
    try setupAudioEngine()

    // Create the stream and publish its continuation BEFORE the tap is
    // installed and the engine is started. The tap yields through
    // `outputContinuation?`, so any buffer delivered while the continuation is
    // still unset would be dropped silently.
    let (stream, continuation) = AsyncStream<AVAudioPCMBuffer>.makeStream(bufferingPolicy: .unbounded)
    outputContinuation = continuation

    let inputNode = audioEngine.inputNode
    inputNode.installTap(onBus: 0,
                         bufferSize: 4096,
                         format: inputNode.outputFormat(forBus: 0)) { [weak self] buffer, _ in
        guard let self else { return }
        self.writeBufferToDisk(buffer: buffer)
        self.outputContinuation?.yield(buffer)
    }

    audioEngine.prepare()
    try audioEngine.start()

    return stream
}
Stream audio to SpeechAnalyzer and SpeechTranscriber
/// Converts `buffer` to the analyzer's audio format and feeds it to the
/// analyzer's input stream.
///
/// - Parameter buffer: The captured audio buffer to transcribe.
/// - Throws: `TranscriptionError.invalidAudioDataType` when the input builder
///   or the analyzer format has not been set, or any error from the conversion.
func streamAudioToTranscriber(_ buffer: AVAudioPCMBuffer) async throws {
    guard let builder = inputBuilder, let targetFormat = analyzerFormat else {
        throw TranscriptionError.invalidAudioDataType
    }

    let convertedBuffer = try self.converter.convertBuffer(buffer, to: targetFormat)
    builder.yield(AnalyzerInput(buffer: convertedBuffer))
}
Finalize the transcript stream
// Finalize the transcript stream: when an analyzer exists, ask it to finalize
// and finish through the end of the input (a nil analyzer makes this a no-op).
try await analyzer?.finalizeAndFinishThroughEndOfInput() Resources
Related sessions
-
13 min