Dunfey · Hotel WWDC as data, est. 1983
Front desk everything
Years
Topics

2026 · AI & Machine Learning · Audio & Video

WWDC26 · 17 min · AI & Machine Learning / Audio & Video

Meet the Music Understanding framework

Discover Music Understanding, a new framework that lets your app analyze audio across six dimensions, on device: key, rhythm, structure, pace, instrument activity, and loudness. And use the Music Understanding Lab sample app to visualize each result.

Watch at developer.apple.com ↗

Transcript all transcripts

Chapters

Code shown on screen · 18 snippets

Initialize the session swift · at 4:47 ↗
import MusicUnderstanding

// Presents a file importer limited to audio content types. On success, the picked
// URL is wrapped in an AVURLAsset, a MusicUnderstandingSession is created from
// that asset, and `analyze()` is run on the session.
// NOTE(review): as shown, the switch covers only `.success`, and `try await` is
// used directly inside the completion closure. A real app presumably needs a
// `.failure` case and an async, throwing context around the session calls — confirm.
.fileImporter(isPresented: $isPresented, allowedContentTypes: [.audio]) { result in
    switch result {
    case .success(let url):
        // The asset is created with the prefer-precise-duration-and-timing option set to true.
        let asset = AVURLAsset(url: url, 
                               options: [AVURLAssetPreferPreciseDurationAndTimingKey : true])
        let session = try await MusicUnderstandingSession(asset: asset)
        // `results` is not used further in this snippet.
        let results = try await session.analyze()
    }
}
Inside SessionResult swift · at 5:24 ↗
import MusicUnderstanding

/// Aggregate result with one optional member per analysis dimension:
/// instrument activity, key, loudness, pace, rhythm, and structure.
///
/// Every member is optional.
/// NOTE(review): presumably a member is `nil` when that analysis was not
/// requested or produced no result — confirm against the framework documentation.
public struct SessionResult: Codable, Sendable {
    /// Instrument activity analysis, if present.
    public let instrumentActivity: InstrumentActivityResult?
    /// Key analysis, if present.
    public let key: KeyResult?
    /// Loudness analysis, if present.
    public let loudness: LoudnessResult?
    /// Pace analysis, if present.
    public let pace: PaceResult?
    /// Rhythm analysis, if present.
    public let rhythm: RhythmResult?
    /// Structure analysis, if present.
    public let structure: StructureResult?
}
TimedValue swift · at 5:53 ↗
import MusicUnderstanding

/// A value paired with a single point in time.
///
/// `Value` must itself be `Codable`, `Equatable`, and `Sendable`.
public struct TimedValue<Value>: Codable, Equatable, Sendable
where Value: Codable & Equatable & Sendable {
    /// The time associated with `value`.
    public let time: CMTime
    /// The payload associated with `time`.
    public let value: Value
}
RangedValue swift · at 5:58 ↗
import MusicUnderstanding

/// A value paired with a time range.
///
/// `Value` must itself be `Codable`, `Equatable`, and `Sendable`.
public struct RangedValue<Value>: Codable, Equatable, Sendable
where Value: Codable & Equatable & Sendable {
    /// The time range associated with `value`.
    public let range: CMTimeRange
    /// The payload associated with `range`.
    public let value: Value
}
Key analysis swift · at 6:27 ↗
/// Result of key analysis: a list of time ranges, each carrying the
/// `KeySignature` associated with that range.
public struct KeyResult: Codable, Sendable {
    /// Time ranges paired with a key signature.
    public let ranges: [MusicUnderstandingSession.RangedValue<KeySignature>]
}
KeySignature swift · at 6:43 ↗
/// A key signature expressed as a tonic plus a mode.
public struct KeySignature: Codable, Hashable, Sendable {
    /// The tonic of the key.
    public let tonic: Tonic
    /// The mode of the key.
    public let mode: Mode
}
Using tonic swift · at 6:48 ↗
/// The tonic of a key signature, backed by a `String` raw value.
///
/// The case list includes both flat and sharp spellings (for example `aFlat`
/// and `gSharp`). The enum is marked `@frozen`.
@frozen public enum Tonic: String, Codable, Hashable, Sendable {
    case aFlat, aSharp, a, bFlat, b, c, cSharp, d, dFlat, dSharp, eFlat, e, f, fSharp, g, gFlat, gSharp
}
Using mode swift · at 6:59 ↗
/// The mode of a key signature, backed by a `String` raw value: major or minor.
public enum Mode: String, Codable, Hashable, Sendable {
    case major, minor
}
Rhythm analysis swift · at 7:16 ↗
import MusicUnderstanding

/// Result of rhythm analysis: beat times, bar times, and an optional tempo.
public struct RhythmResult: Codable, Sendable {
    /// Times of beats.
    /// NOTE(review): presumably positions on the analyzed audio's timeline — confirm.
    public let beats: [CMTime]
    /// Times of bars.
    /// NOTE(review): presumably the start of each bar — confirm.
    public let bars: [CMTime]
    /// Beats per minute; optional, so it may be absent.
    public let beatsPerMinute: Float?
}
StructureResult swift · at 8:42 ↗
import MusicUnderstanding

/// Result of structure analysis, exposed as three lists of time ranges.
/// NOTE(review): the relationship between sections, segments, and phrases
/// (for example nesting or granularity) is not shown here — see the framework documentation.
public struct StructureResult: Codable, Sendable {
    /// Time ranges of sections.
    public let sections: [CMTimeRange]
    /// Time ranges of segments.
    public let segments: [CMTimeRange]
    /// Time ranges of phrases.
    public let phrases: [CMTimeRange]
}
Analyzing pace swift · at 9:26 ↗
import MusicUnderstanding

/// Result of pace analysis: time ranges, each carrying a `Double` pace value.
/// NOTE(review): the unit of the pace value is not stated in this declaration — confirm.
public struct PaceResult: Codable, Sendable {
    /// Time ranges paired with a pace value.
    public let ranges: [MusicUnderstandingSession.RangedValue<Double>]
}
InstrumentActivityResult swift · at 10:13 ↗
import MusicUnderstanding

/// Result of instrument activity analysis, keyed by `Instrument`.
public struct InstrumentActivityResult: Codable, Sendable {
    /// For each instrument, a list of time ranges.
    /// NOTE(review): presumably the ranges in which the instrument is active — confirm.
    public let ranges: [Instrument: [CMTimeRange]]
    /// For each instrument, a list of timed `Float` values.
    /// NOTE(review): the scale and meaning of the values are not shown here — confirm.
    public let activity: [Instrument: [MusicUnderstandingSession.TimedValue<Float>]]
}
LoudnessResult swift · at 11:45 ↗
import MusicUnderstanding

/// Result of loudness analysis.
///
/// `integrated` and `peak` are single timed values; `momentary` and
/// `shortTerm` are arrays of timed values.
/// NOTE(review): measurement units and window lengths are not stated in this
/// declaration — confirm against the framework documentation.
public struct LoudnessResult: Codable, Sendable {
    /// Integrated loudness, as a single timed value.
    public let integrated: MusicUnderstandingSession.TimedValue<Float>
    /// Momentary loudness, as a series of timed values.
    public let momentary: [MusicUnderstandingSession.TimedValue<Float>]
    /// Short-term loudness, as a series of timed values.
    public let shortTerm: [MusicUnderstandingSession.TimedValue<Float>]
    /// Peak, as a single timed value.
    public let peak: MusicUnderstandingSession.TimedValue<Float>
}
Streaming API for loudness swift · at 12:48 ↗
import MusicUnderstanding

/// A sendable async sequence of `LoudnessResult` values that can fail with any `Error`.
public var loudnessResults: some AsyncSequence<LoudnessResult, any Error> & Sendable
Streaming API for loudness swift · at 12:55 ↗
import MusicUnderstanding

// Streams loudness results while analysis runs: one child task consumes
// `session.loudnessResults`, the other drives `analyze(for:)` restricted to loudness.
let audioProvider = AudioProvider()
let session = MusicUnderstandingSession(audioProvider: audioProvider)
await withThrowingTaskGroup(of: Void.self) { group in
    // Consumer: forwards each streamed result to the UI-level callback.
    // NOTE(review): `LoudnessResult.momentary` is declared on this page as an
    // array of timed values, so `.momentary.value` may need adjusting (for
    // example to the latest element) — confirm against the SDK.
    group.addTask {
        for try await result in await session.loudnessResults {
            updateAudioLevel(result.momentary.value)
        }
    }

    // Producer: runs the analysis for the loudness dimension only.
    group.addTask {
        try await session.analyze(for: [.loudness])
    }
}
Audio Provider swift · at 13:19 ↗
import MusicUnderstanding

/// An audio source that conforms to both `AsyncSequence` and
/// `AsyncIteratorProtocol`, so it serves as its own iterator.
struct AudioProvider: AsyncSequence, AsyncIteratorProtocol {
   /// Returns `self`, since this type is also the iterator.
   func makeAsyncIterator() -> Self {
        return self
    }

   /// Supplies the next audio buffer, or `nil` when there is no more audio.
   mutating func next() async -> AVReadOnlyAudioPCMBuffer? {
        // Return the next audio buffer, or nil to signal completion
        // NOTE(review): placeholder body — a real implementation must return a value here.
    }
}
Encode to JSON swift · at 13:55 ↗
import MusicUnderstanding

// Runs a full analysis on `asset` and encodes the results as JSON.
// `asset` is not defined in this snippet; it is expected to come from the surrounding code.
let session = try await MusicUnderstandingSession(asset: asset)
let results = try await session.analyze()

let encoder = JSONEncoder()
// The encoded data is not stored or used in this snippet.
try encoder.encode(results)
Suggestion for using pace swift · at 14:47 ↗
// Derives a per-clip duration by dividing 60 by a pace value.
// NOTE(review): `paceValue` is not defined in this snippet; presumably it comes
// from a PaceResult range and 60 stands for seconds per minute — confirm the
// units, and that the value cannot be zero.
let timePerClip = 60 / paceValue

Resources