Telereview/code-speech-To-Text.py

import os
import wave

import librosa
import numpy as np
import pyaudio
import scipy.spatial.distance as dist
def dp(distmat):
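    """Dynamic programming over a frame-to-frame distance matrix (DTW):
    returns the accumulated cost matrix and the final alignment cost
    normalized by N + M."""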
    N, M = distmat.shape
    # Initialize the cost matrix with an infinite border row and column
    # so the recursion below has a base case at (0, 0).
    costmat = np.zeros((N + 1, M + 1))
    for i in range(1, N + 1):
        costmat[i, 0] = np.inf
    for i in range(1, M + 1):
        costmat[0, i] = np.inf
    for i in range(N):
        for j in range(M):
            # Compute the minimal cost of reaching costmat[i][j]: three
            # paths lead to each cell, and the cheapest one is taken.
            penalty = [
                costmat[i, j],      # case T == 0 (diagonal match)
                costmat[i, j + 1],  # case T == 1 (insertion)
                costmat[i + 1, j],  # case T == 2 (deletion)
            ]
            ipenalty = np.argmin(penalty)
            costmat[i + 1, j + 1] = distmat[i, j] + penalty[ipenalty]
    # Remove the infinite border values.
    costmat = costmat[1:, 1:]
    return (costmat, costmat[-1, -1] / (N + M))
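# A minimal sketch of dp() on random feature sequences (shapes assumed):
#   q, t = np.random.rand(5, 13), np.random.rand(8, 13)
#   costmat, cost = dp(dist.cdist(q, t, "cosine"))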
def calculate_mfcc(audio, sr):
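    """Compute MFCC features for `audio`; rows of the returned array are
    frames and columns are the 13 coefficients."""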
    # Define parameters for the MFCC calculation.
    n_mfcc = 13
    n_fft = 2048
    hop_length = 512
    fmin = 0
    fmax = sr / 2
    # Calculate the MFCCs.
    mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc, n_fft=n_fft,
                                 hop_length=hop_length, fmin=fmin, fmax=fmax)
    return mfccs.T
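# e.g. calculate_mfcc(audio, 44100) returns an array of shape
# (n_frames, 13): one 13-coefficient MFCC vector per frame.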
def calculate_dtw_cost(mfccs_query, mfccs_train):
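    """Return the normalized DTW cost between two MFCC sequences, using
    cosine distance between frames."""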
    distmat = dist.cdist(mfccs_query, mfccs_train, "cosine")
    costmat, mincost = dp(distmat)
    return mincost
def recognize_speech(audio_query, audio_train_list, sr):  # sr: sampling rate
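    """Return the index of the training recording whose MFCC sequence has
    the lowest DTW cost against the query."""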
    # Calculate MFCCs for the query audio.
    mfccs_query = calculate_mfcc(audio_query, sr)
    # Calculate the DTW cost against each recording in the training data.
    dtw_costs = []
    for audio_train in audio_train_list:
        mfccs_train = calculate_mfcc(audio_train, sr)
        mincost = calculate_dtw_cost(mfccs_query, mfccs_train)
        dtw_costs.append(mincost)
    # Find the index of the recording with the lowest DTW cost.
    index = np.argmin(dtw_costs)
    # Return the index of the recognized word.
    return index
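# The returned index is a position in audio_train_list, resolved to an
# actual word by get_recognized_word below.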
def get_recognized_word(recognized_word_index):
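    """Translate a training-file index into the word it belongs to."""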
    # Define a dictionary mapping recognized word indices to actual words.
    word_map = {
        "un": [0, 1, 2, 3, 4, 5, 6],
        "deux": [7, 8, 9, 10, 11, 12, 13],
        "trois": [14, 15, 16, 17, 18, 19],
        "quatre": [20, 21, 22, 23, 24, 25, 26],
        "cinq": [27, 28, 29, 30, 31, 32],
        "six": [33, 34, 35, 36, 37, 38],
        "sept": [39, 40, 41, 42, 43, 44],
        "huit": [45, 46, 47, 48, 49, 50, 51],
        "neuf": [52, 53, 54, 55, 56, 57, 58],
        "dix": [59, 60, 61, 62, 63, 64, 65],
        "bien": [66, 67, 68, 69, 70, 71, 72],
        "super": [127, 128, 129, 130, 131, 132, 133],
        "génial": [87, 88, 89, 90, 91, 92, 93],
        "sympa": [134, 135, 136, 137, 138, 139, 140],
        "propre": [122, 123, 124, 125, 126],
        "nul": [115, 116, 117, 118, 119, 120, 121],
        "ennuyant": [80, 81, 82, 83, 84, 85, 86],
        "j'ai beaucoup aimé": [94, 95, 96, 97, 98, 99, 100],
        "j'ai trouvé ça génial": [101, 102, 103, 104, 105, 106, 107],
        "je n'ai pas aimé": [108, 109, 110, 111, 112, 113, 114],
        "c'était drole": [73, 74, 75, 76, 77, 78, 79],
    }
    for word, indices in word_map.items():
        if recognized_word_index in indices:
            return word
    return "Word not recognized"
def record_audio(filename, duration, sr):
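    """Record `duration` seconds of mono audio from the default input
    device at sampling rate `sr` and save it as `filename`.wav."""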
    chunk = 1024
    sample_format = pyaudio.paInt16
    channels = 1
    record_seconds = duration
    filename = f"{filename}.wav"
    p = pyaudio.PyAudio()
    stream = p.open(format=sample_format,
                    channels=channels,
                    rate=sr,
                    frames_per_buffer=chunk,
                    input=True)
    frames = []
    print("Recording...")
    for i in range(0, int(sr / chunk * record_seconds)):
        data = stream.read(chunk)
        frames.append(data)
    stream.stop_stream()
    stream.close()
    p.terminate()
    print("Recording finished")
    wf = wave.open(filename, "wb")
    wf.setnchannels(channels)
    wf.setsampwidth(p.get_sample_size(sample_format))
    wf.setframerate(sr)
    wf.writeframes(b"".join(frames))
    wf.close()
    print(f"File saved as {filename}")
def coupe_silence(signal):
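    """Strip leading silence from `signal` and return the trimmed array."""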
    # Amplitude below which a sample counts as silence; recorded float
    # samples are rarely exactly zero, so an assumed threshold is used
    # instead of an exact comparison with 0.
    threshold = 1e-4
    t = 0
    while t < len(signal) and abs(signal[t]) < threshold:
        t += 1
    return signal[t:]
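# Note: librosa.effects.trim(signal)[0] offers a comparable decibel-based
# trim of leading and trailing silence, if a library routine is preferred.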
"""
sr = 44100 # fréquence d'échantillonnage
duration = 2.5 # durée d'enregistrement en secondes
filename = "audio_query" # nom du fichier à enregistrer
record_audio(filename, duration, sr)
audio_query, sr = librosa.load('C:\\Users\\HP\\audio_query.wav', sr=sr)
audio_train_list = [librosa.load('C:\\Users\\HP\\Documents\\cool.wav', sr=sr)[0], librosa.load('C:\\Users\\HP\\Documents\\formidable.wav', sr=sr)[0], librosa.load('C:\\Users\\HP\\Documents\\cest mauvais.wav', sr=sr)[0] , librosa.load('C:\\Users\\HP\\Documents\\un.wav', sr=sr)[0], librosa.load('C:\\Users\\HP\\Documents\\parfait.wav', sr=sr)[0]]
recognized_word_index = recognize_speech(audio_query, audio_train_list, sr)
print(f'Recognized word: {recognized_word_index}')
"""
sr = 44100  # sampling rate
duration = 6  # recording duration in seconds
filename = "audio_query"  # name of the file to record
record_audio(filename, duration, sr)
audio_query, sr = librosa.load('C:\\Users\\HP\\audio_query.wav', sr=sr)
audio_query = coupe_silence(audio_query)
audio_train_list = []
# Note: os.listdir order is platform-dependent; the indices in word_map
# assume a fixed ordering of the training files.
for file in os.listdir('C:\\Users\\HP\\Documents\\Base de données'):
    audio_train_list.append(librosa.load('C:\\Users\\HP\\Documents\\Base de données\\' + file, sr=sr)[0])
recognized_word_index = recognize_speech(audio_query, audio_train_list, sr)
recognized_word = get_recognized_word(recognized_word_index)
print(f'Recognized word: {recognized_word}')