当前位置:网站首页>Audio knowledge (III) -- MFCCs code implementation

Audio knowledge (III) -- MFCCs code implementation

2022-06-24 11:08:00 languageX

The previous article introduced the principle and extraction process of MFCCs. This article shows how to implement MFCC extraction in Python.

To recap, the main steps of MFCC extraction are:

1. Read audio

2. Pre-emphasis

3. Windowing

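4. Framing (in the code below the signal is split into frames first, and the window is then applied to each frame)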

5. Fourier transform

6. Compute the mel spectrum

7. Discrete cosine transform, which yields the mel-frequency cepstrum

Here is the code for each step; the main flow is explained in detail in the code comments:

# -*- coding: utf-8 -*-
# @Time    : 2021-05-10 15:41
# @Author  :

import numpy as np
import scipy
from scipy.fftpack import dct
from scipy.io import wavfile
# Frame (analysis window) length in samples; passed to get_frames as frame_len.
WIN_LEN = 255
# Hop length in samples: offset between the starts of consecutive frames.
HOP_LEN = 125
# FFT size (number of points) used by stft and by the mel filter-bank layout.
N_FFT = 255
# Number of triangular mel filters in the filter bank.
N_FILT = 40
# Number of cepstral coefficients kept after the DCT.
NUM_CEPS = 13
# Audio sampling rate in Hz assumed by the mel filter bank. It is a fixed
# constant here; the rate stored in the WAV file is not used.
sample_rate = 16000

def read_audio(wave_path):
    """Read a WAV file and return its samples as floating-point amplitudes.

    Floating-point WAV data (nominally in [-1.0, 1.0]) is scaled by 32767 and
    rounded, which puts it on the 16-bit PCM amplitude scale. Integer PCM
    data is already expressed in integer amplitude units, so it is only
    converted to float64: multiplying it by 32767 in its native integer
    dtype would overflow and wrap around.

    :param wave_path: path or open file handle accepted by
        scipy.io.wavfile.read
    :return: numpy array of float samples; the sample rate stored in the
        file is read but not returned
    """
    _rate, data = wavfile.read(wave_path)
    if np.issubdtype(data.dtype, np.floating):
        return np.round(32767 * data)
    # NOTE(review): integer widths other than 16 bit keep their native value
    # range here - confirm whether they should be rescaled.
    return data.astype(np.float64)


def pre_emphasised(data, pre_emphasis=0.96):
    """Apply a first-order pre-emphasis filter to a signal.

    The output is y[0] = x[0] and y[n] = x[n] - pre_emphasis * x[n - 1] for
    n >= 1, computed with array slicing instead of a per-sample Python loop.

    :param data: sequence or numpy array of samples
    :param pre_emphasis: filter coefficient, 0.96 by default
    :return: flat float64 numpy array with one value per input sample; an
        empty input yields an empty array
    """
    samples = np.asarray(data, dtype=np.float64)
    # samples[:1] is empty for empty input, so no special case is needed.
    return np.append(samples[:1], samples[1:] - pre_emphasis * samples[:-1])

def get_hann_window(length=255):
    """Build a Hann window laid out as a single row.

    :param length: number of points in the window
    :return: float32 numpy array of shape (1, length)
    """
    row = np.hanning(length)[np.newaxis, :]
    return row.astype(np.float32)

def get_frames(pcm, frame_len, hop_len):
    """Split a 1-D signal into overlapping fixed-length frames.

    Frame i covers samples [i * hop_len, i * hop_len + frame_len). For a
    signal at least one frame long, the number of frames is
    1 + (len(pcm) - frame_len) // hop_len, so trailing samples that do not
    fill a whole frame are dropped. A non-empty signal shorter than one
    frame is zero-padded on the right and returned as a single frame instead
    of raising. An empty signal yields zero frames.

    :param pcm: 1-D sequence or numpy array of samples
    :param frame_len: number of samples per frame
    :param hop_len: offset in samples between consecutive frame starts
    :return: numpy array of shape [number of frames, frame_len]
    :raises ValueError: if hop_len is not positive
    """
    if hop_len <= 0:
        raise ValueError("hop_len must be a positive number of samples")

    samples = np.asarray(pcm)
    pcm_len = len(samples)
    if pcm_len == 0:
        return np.zeros((0, frame_len), dtype=samples.dtype)
    if pcm_len < frame_len:
        # Too short for a full frame: pad with zeros so one frame exists.
        samples = np.pad(samples, (0, frame_len - pcm_len), 'constant')
        pcm_len = frame_len

    frames_num = 1 + (pcm_len - frame_len) // hop_len
    # Row i of the index matrix lists the sample positions of frame i.
    starts = hop_len * np.arange(frames_num)[:, np.newaxis]
    offsets = np.arange(frame_len)[np.newaxis, :]
    return samples[starts + offsets]

def stft(frames, n_fft=None):
    """Compute the power spectrum of each frame with a real FFT.

    Each row of ``frames`` is transformed with ``np.fft.rfft`` using
    ``n_fft`` points. The power is the squared magnitude divided by
    ``n_fft``.

    :param frames: framed (and normally windowed) data, one frame per row
    :param n_fft: FFT size; when omitted the module-level N_FFT is used
    :return: power spectrum, one row per frame
    """
    if n_fft is None:
        n_fft = N_FFT
    # Magnitude of the one-sided spectrum.
    mag_frames = np.absolute(np.fft.rfft(frames, n_fft))
    # Power spectrum.
    pow_frames = (1.0 / n_fft) * mag_frames ** 2
    print("pow_frames", pow_frames.shape)
    return pow_frames

def get_filter_bank(pow_frames, n_fft=None, n_filt=None, rate=None):
    """Apply a triangular mel filter bank and return log filter-bank energies.

    Filter edges are spaced evenly on the mel scale between 0 Hz and half
    the sampling rate, converted back to Hz and mapped to FFT bin indices.
    Each filter rises linearly from its left edge to its centre and falls
    linearly to its right edge. Energies that are exactly zero are replaced
    by machine epsilon before the logarithm is taken.

    :param pow_frames: power spectrum of shape [number of frames, number of bins]
    :param n_fft: FFT size used to produce pow_frames; defaults to N_FFT
    :param n_filt: number of mel filters; defaults to N_FILT
    :param rate: sampling rate in Hz; defaults to the module-level sample_rate
    :return: array of shape [number of frames, n_filt] holding
        20 * log10 of the filter-bank energies
    """
    if n_fft is None:
        n_fft = N_FFT
    if n_filt is None:
        n_filt = N_FILT
    if rate is None:
        rate = sample_rate

    low_freq_mel = 0
    # Upper band edge (half the sampling rate) converted from Hz to mel.
    high_freq_mel = 2595 * np.log10(1 + (rate / 2) / 700)
    # n_filt filters need n_filt + 2 edge points, evenly spaced in mel.
    mel_points = np.linspace(low_freq_mel, high_freq_mel, n_filt + 2)
    # Edge points converted back from mel to Hz.
    hz_points = 700 * (10 ** (mel_points / 2595) - 1)
    # FFT bin index of every edge point.
    bin_idx = np.floor((n_fft + 1) * hz_points / rate).astype(int)
    fbank = np.zeros((n_filt, int(np.floor(n_fft / 2 + 1))))

    for m in range(n_filt):
        left, center, right = bin_idx[m], bin_idx[m + 1], bin_idx[m + 2]
        # Adjacent edges may share a bin; that slope is then empty and skipped.
        if center > left:
            fbank[m, left:center] = (np.arange(left, center) - left) / (center - left)
        if right > center:
            fbank[m, center:right] = (right - np.arange(center, right)) / (right - center)
    print("pow_frames,fbank", pow_frames.shape,fbank.shape)
    # [num_frame, num_bins] dot [num_bins, num_filter]: sum each frame's
    # power over every filter's frequency band.
    filter_banks = np.dot(pow_frames, fbank.T)
    # Avoid log10(0) for bands that received no energy.
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
    # NOTE(review): a factor of 20 is applied to a power quantity; decibels of
    # power normally use 10 * log10. Kept unchanged so results stay the same.
    filter_banks = 20 * np.log10(filter_banks)
    print("filter_banks", filter_banks.shape)
    return filter_banks

def get_MFCCs(filter_banks, num_ceps=None):
    """Compute MFCCs from log mel filter-bank energies.

    A type-II orthonormal DCT is applied along the filter axis of each
    frame. Coefficient 0 is discarded and the next ``num_ceps`` coefficients
    are kept.

    :param filter_banks: log mel filter-bank energies of shape
        [number of frames, number of filters]
    :param num_ceps: number of cepstral coefficients to keep; when omitted
        the module-level NUM_CEPS is used
    :return: array of shape [number of frames, num_ceps] (fewer columns if
        fewer coefficients are available)
    """
    if num_ceps is None:
        num_ceps = NUM_CEPS
    mfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, 1: (num_ceps + 1)]
    print("mfcc.shape", mfcc.shape)
    # The result was previously computed and then dropped; hand it back.
    return mfcc


if __name__ == '__main__':
    file = "test.wav"
    # Read the audio samples.
    data = read_audio(file)
    # Pre-emphasis.
    data = pre_emphasised(data)
    # Build the Hann window with the same length as the frames, so the
    # element-wise product below still broadcasts if WIN_LEN is changed.
    _han = get_hann_window(WIN_LEN)
    # Split the signal into frames.
    frames = get_frames(data, WIN_LEN, HOP_LEN)
    # Apply the window to every frame.
    frames = frames*_han
    # Fourier transform and power spectrum.
    pow_frames = stft(frames)
    # Mel filter bank: log mel power spectrum.
    filter_banks = get_filter_bank(pow_frames)
    # Discrete cosine transform: mel-frequency cepstral coefficients.
    get_MFCCs(filter_banks)
原网站

版权声明
本文为[languageX]所创,转载请带上原文链接,感谢
https://yzsam.com/2021/06/20210604202514777f.html