import os
import wave

import librosa
import numpy as np
import pyaudio
import scipy.spatial.distance as dist


def dp(distmat):
    """Dynamic-programming pass of DTW: return the accumulated cost matrix
    and the total cost normalised by the path-length bound N + M."""
    N, M = distmat.shape

    # Initialise the cost matrix; the infinite first row and column force
    # every warping path to start at cell (0, 0).
    costmat = np.zeros((N + 1, M + 1))
    costmat[1:, 0] = np.inf
    costmat[0, 1:] = np.inf

    for i in range(N):
        for j in range(M):
            # Cell costmat[i+1][j+1] can be reached along three paths;
            # keep the one with minimal accumulated cost.
            penalty = [
                costmat[i, j],      # diagonal move (match)
                costmat[i, j + 1],  # vertical move (insertion)
                costmat[i + 1, j],  # horizontal move (deletion)
            ]
            ipenalty = np.argmin(penalty)
            costmat[i + 1, j + 1] = distmat[i, j] + penalty[ipenalty]

    # Drop the infinite first row and column.
    costmat = costmat[1:, 1:]
    return costmat, costmat[-1, -1] / (N + M)


def calculate_mfcc(audio, sr):
    """Compute MFCCs and return one feature vector per frame (frames x coefficients)."""
    # Parameters for the MFCC calculation.
    n_mfcc = 13
    n_fft = 2048
    hop_length = 512
    fmin = 0
    fmax = sr / 2

    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft,
                                 hop_length=hop_length, fmin=fmin, fmax=fmax)
    return mfccs.T


def calculate_dtw_cost(mfccs_query, mfccs_train):
    """Cosine distance between every pair of frames, then the DTW alignment cost."""
    distmat = dist.cdist(mfccs_query, mfccs_train, "cosine")
    costmat, mincost = dp(distmat)
    return mincost


def recognize_speech(audio_query, audio_train_list, sr):  # sr: sampling rate
    # Calculate MFCCs for the query audio.
    mfccs_query = calculate_mfcc(audio_query, sr)

    # Calculate the DTW cost between the query and each training recording.
    dtw_costs = []
    for audio_train in audio_train_list:
        mfccs_train = calculate_mfcc(audio_train, sr)
        mincost = calculate_dtw_cost(mfccs_query, mfccs_train)
        dtw_costs.append(mincost)

    # Return the index of the training recording with the lowest DTW cost.
    return np.argmin(dtw_costs)


def get_recognized_word(recognized_word_index):
    # Map each word to the indices of its recordings in the training set.
    word_map = {
        "un": [0, 1, 2, 3, 4, 5, 6],
        "deux": [7, 8, 9, 10, 11, 12, 13],
        "trois": [14, 15, 16, 17, 18, 19],
        "quatre": [20, 21, 22, 23, 24, 25, 26],
        "cinq": [27, 28, 29, 30, 31, 32],
        "six": [33, 34, 35, 36, 37, 38],
        "sept": [39, 40, 41, 42, 43, 44],
        "huit": [45, 46, 47, 48, 49, 50, 51],
        "neuf": [52, 53, 54, 55, 56, 57, 58],
        "dix": [59, 60, 61, 62, 63, 64, 65],
        "bien": [66, 67, 68, 69, 70, 71, 72],
        "super": [127, 128, 129, 130, 131, 132, 133],
        "génial": [87, 88, 89, 90, 91, 92, 93],
        "sympa": [134, 135, 136, 137, 138, 139, 140],
        "propre": [122, 123, 124, 125, 126],
        "nul": [115, 116, 117, 118, 119, 120, 121],
        "ennuyant": [80, 81, 82, 83, 84, 85, 86],
        "j'ai beaucoup aimé": [94, 95, 96, 97, 98, 99, 100],
        "j'ai trouvé ça génial": [101, 102, 103, 104, 105, 106, 107],
        "je n'ai pas aimé": [108, 109, 110, 111, 112, 113, 114],
        "c'était drole": [73, 74, 75, 76, 77, 78, 79],
    }
    for word, indices in word_map.items():
        if recognized_word_index in indices:
            return word
    return "Word not recognized"


def record_audio(filename, duration, sr):
    """Record `duration` seconds of mono 16-bit audio from the default
    microphone and save it as `<filename>.wav`."""
    chunk = 1024
    sample_format = pyaudio.paInt16
    channels = 1
    record_seconds = duration
    filename = f"{filename}.wav"

    p = pyaudio.PyAudio()
    stream = p.open(format=sample_format, channels=channels, rate=sr,
                    frames_per_buffer=chunk, input=True)
    frames = []
    print("Recording...")
    for _ in range(int(sr / chunk * record_seconds)):
        data = stream.read(chunk)
        frames.append(data)
    stream.stop_stream()
    stream.close()
    p.terminate()
    print("Recording finished")

    wf = wave.open(filename, "wb")
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(sample_format))
    wf.setframerate(sr)
    wf.writeframes(b"".join(frames))
    wf.close()
    print(f"File saved as {filename}")
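
# Illustrative sanity check for dp() above -- a sketch, not part of the
# original script (the helper name _dtw_sanity_check is hypothetical).
# Aligning a sequence with a time-stretched copy of itself should give a
# normalised DTW cost of zero, since every frame finds an exact match along
# the warping path. Euclidean distance is used here so the toy 1-D frames
# behave sensibly; the recogniser itself uses cosine distance on MFCC frames.
def _dtw_sanity_check():
    a = np.array([[0.0], [1.0], [2.0], [3.0]])
    b = np.array([[0.0], [0.0], [1.0], [1.0], [2.0], [3.0]])  # stretched copy of a
    distmat = dist.cdist(a, b, "euclidean")
    _, normalised_cost = dp(distmat)
    print(f"DTW cost for a stretched copy (expected 0.0): {normalised_cost}")
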
def coupe_silence(signal, min_run=88):
    """Remove every run of at least `min_run` consecutive zero samples from
    `signal` and return the trimmed copy. Comparing samples to exactly zero
    assumes digital silence; noisy recordings may need an amplitude
    threshold instead."""
    keep = np.ones(len(signal), dtype=bool)
    t = 0
    while t < len(signal):
        if signal[t] == 0:
            # Measure the length of this run of zeros.
            p = t
            while p < len(signal) and signal[p] == 0:
                p += 1
            if p - t >= min_run:  # long enough to count as silence
                keep[t:p] = False
            t = p
        else:
            t += 1
    return signal[keep]


# Example usage:
"""
sr = 44100  # sampling rate
duration = 2.5  # recording duration in seconds
filename = "audio_query"  # name of the file to record

record_audio(filename, duration, sr)
audio_query, sr = librosa.load('C:\\Users\\HP\\audio_query.wav', sr=sr)
audio_train_list = [librosa.load('C:\\Users\\HP\\Documents\\cool.wav', sr=sr)[0],
                    librosa.load('C:\\Users\\HP\\Documents\\formidable.wav', sr=sr)[0],
                    librosa.load('C:\\Users\\HP\\Documents\\cest mauvais.wav', sr=sr)[0],
                    librosa.load('C:\\Users\\HP\\Documents\\un.wav', sr=sr)[0],
                    librosa.load('C:\\Users\\HP\\Documents\\parfait.wav', sr=sr)[0]]
recognized_word_index = recognize_speech(audio_query, audio_train_list, sr)
print(f'Recognized word: {recognized_word_index}')
"""

sr = 44100  # sampling rate
duration = 6  # recording duration in seconds
filename = "audio_query"  # name of the file to record

record_audio(filename, duration, sr)
audio_query, sr = librosa.load('C:\\Users\\HP\\audio_query.wav', sr=sr)
audio_query = coupe_silence(audio_query)  # keep the trimmed signal

audio_train_list = []
for file in os.listdir('C:\\Users\\HP\\Documents\\Base de données'):
    audio_train_list.append(
        librosa.load('C:\\Users\\HP\\Documents\\Base de données\\' + file, sr=sr)[0])

recognized_word_index = recognize_speech(audio_query, audio_train_list, sr)
recognized_word = get_recognized_word(recognized_word_index)
print(f'Recognized word: {recognized_word}')
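
# Quick check of coupe_silence on synthetic data (illustrative values only):
# a run of 100 zeros exceeds the 88-sample default and is removed, so the
# 115-sample input shrinks to 15 samples.
# >>> x = np.concatenate([np.ones(10), np.zeros(100), np.ones(5)])
# >>> len(coupe_silence(x))
# 15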