préparation intégration reconnaissance vocale

2026-04-10 16:40:20 +02:00 · 2023-03-23 09:53:16 +01:00
parent bc0270b707
commit e097c1fd23
13 changed files with 87 additions and 32 deletions
--- a/code/backend_reconnaissance/Dockerfile
+++ b/code/backend_reconnaissance/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.8-slim
+FROM python:3.8

 #Ne pas créer les fichiers .pyc
 ENV PYTHONDONTWRITEBYTECODE=1
@@ -7,7 +7,7 @@ ENV PYTHONUNBUFFERED=1

 #Installation des dépendances de opencv 
 RUN apt-get update
-RUN apt-get install ffmpeg libsm6 libxext6  -y
+RUN apt-get install ffmpeg libsm6 libxext6 portaudio19-dev python3-pyaudio -y

 # Installation des dépendances python
 COPY requirements.txt .
--- a/code/backend_reconnaissance/audio_detector.py
+++ b/code/backend_reconnaissance/audio_detector.py
@@ -0,0 +1,208 @@
+import librosa
+import os
+import numpy as np
+import math
+from scipy.io import wavfile
+import wave
+from scipy.fftpack import fft,dct
+import time
+
+from matplotlib.patches import ConnectionPatch
+import matplotlib.pyplot as plt
+import numpy as np
+import scipy.spatial.distance as dist
+import pyaudio
+import wave
+def dp(distmat):
+    """Accumulate dynamic-time-warping costs over a pairwise distance matrix.
+
+    Args:
+        distmat: 2-D NumPy array of shape (N, M) holding the local distance
+            between element i of one sequence and element j of the other.
+
+    Returns:
+        A tuple ``(costmat, normalized_cost)``: ``costmat`` is the (N, M)
+        accumulated-cost matrix and ``normalized_cost`` is its bottom-right
+        entry divided by ``N + M``.
+    """
+    N, M = distmat.shape
+    # One extra leading row and column: 0 at the origin, +inf along the
+    # borders, so the first real cell can only be reached from the origin.
+    costmat = np.zeros((N + 1, M + 1))
+    costmat[1:, 0] = np.inf
+    costmat[0, 1:] = np.inf
+
+    for row in range(1, N + 1):
+        for col in range(1, M + 1):
+            # A cell is reached from the diagonal, from the previous row, or
+            # from the previous column; keep the cheapest predecessor.
+            cheapest = np.min((
+                costmat[row - 1, col - 1],
+                costmat[row - 1, col],
+                costmat[row, col - 1],
+            ))
+            costmat[row, col] = distmat[row - 1, col - 1] + cheapest
+
+    # Drop the padding row/column that only carried the infinities.
+    trimmed = costmat[1:, 1:]
+    return (trimmed, trimmed[-1, -1] / (N + M))
+
+def divsignalaudiobis(signal, sr=44100):
+    """Split a 1-D signal into overlapping fixed-length frames.
+
+    Frames are 20 ms long and start every 10 ms. The signal is zero-padded
+    at the end so that the last frame is complete.
+
+    Args:
+        signal: 1-D sequence of samples.
+        sr: Sampling rate in Hz used to convert the 20 ms / 10 ms durations
+            into sample counts. Defaults to 44100, the rate used by the rest
+            of this module.
+
+    Returns:
+        A float array of shape (n_frames, frame_length). A signal shorter
+        than one frame (including an empty one) yields a single padded frame.
+
+    Raises:
+        ValueError: If ``sr`` is too small to give a hop of at least one sample.
+    """
+    long_signal = 20  # frame length in ms
+    recouvrement = 10  # hop between two frame starts in ms
+    frame_len = long_signal * sr // 1000
+    hop = recouvrement * sr // 1000
+    if hop <= 0:
+        raise ValueError(f"sampling rate too low to frame the signal: {sr}")
+
+    samples = np.asarray(signal, dtype=float)
+    # Always keep at least one frame, even when the signal is shorter than it.
+    n_frames = max(1, int(np.ceil((len(samples) - frame_len) / hop)) + 1)
+    padded_len = hop * (n_frames - 1) + frame_len
+    # A pad width of 0 is valid, so the exact-fit case needs no special branch.
+    padded = np.pad(samples, (0, padded_len - len(samples)), mode='constant')
+
+    # Row i holds samples [i*hop, i*hop + frame_len).
+    starts = hop * np.arange(n_frames)
+    return padded[starts[:, None] + np.arange(frame_len)]
+
+
+def myfft(signal, fe):
+    """Evaluate a direct Fourier sum at the integer frequencies 50..499 Hz.
+
+    Args:
+        signal: Indexable sequence of samples.
+        fe: Sampling frequency in Hz.
+
+    Returns:
+        A list of 450 magnitudes; entry ``k`` is the modulus of the sum at
+        ``50 + k`` Hz divided by the number of samples.
+    """
+    n = len(signal)
+    Te = 1/fe  # sampling period
+    spectrum = []
+    for f in range(50, 500):
+        acc = 0 + 0j
+        for i, sample in enumerate(signal):
+            t = Te * i
+            acc += sample*np.exp(-2*math.pi*f*t*1j)
+        spectrum.append(abs(acc)/n)
+    return spectrum
+
+def puissance_spec(signal, nfft=44100):
+    """Return the power spectrum of already-transformed data.
+
+    Args:
+        signal: Array of (complex) Fourier coefficients.
+        nfft: Normalisation constant, the transform length. Defaults to
+            44100, the length ``mfcc`` passes to ``np.fft.rfft``.
+
+    Returns:
+        An array of the same shape holding ``|signal|**2 / nfft``.
+    """
+    amplitude_fft = np.absolute(signal)
+    return (amplitude_fft**2)/nfft
+
+def BankFiltre(rate, puis_spec):
+    """Project a power spectrum onto a bank of 40 triangular filters.
+
+    The filter edges are spaced evenly on the scale
+    ``1000 * log2(1 + f / 1000)`` between 20 Hz and ``rate // 2``.
+
+    Args:
+        rate: Sampling rate in Hz; only sets the upper edge of the bank.
+        puis_spec: Power spectrum whose last axis has 22051 entries.
+
+    Returns:
+        ``puis_spec`` multiplied by the transposed filter bank, with exact
+        zeros replaced by the float epsilon so a later log is defined.
+    """
+    nb_filtre = 40  # 40 filters is the usual choice
+    freq_min = 20
+    freq_max = rate//2
+    freq_mel_min = 1000*np.log2(1 + freq_min/1000)
+    freq_mel_max = 1000*np.log2(1 + freq_max/1000)
+    # 40 triangles need 42 edge points: each uses its two neighbours.
+    mel_points = np.linspace(freq_mel_min, freq_mel_max, 42)
+    hz_points = 1000*(2**(mel_points/1000) - 1)  # back to Hz
+
+    # NOTE(review): the width is fixed at 22051 columns whatever `rate` is,
+    # and column k is used as frequency k Hz — confirm against the caller.
+    bankf = np.zeros((nb_filtre, int(np.floor(22050 + 1))))
+
+    for row in range(nb_filtre):
+        left, peak, right = hz_points[row], hz_points[row + 1], hz_points[row + 2]
+        k_left = int(math.floor(left))
+        k_peak = int(math.floor(peak))
+        k_right = int(math.floor(right))
+
+        # Rising edge of the triangle.
+        for k in range(k_left, k_peak):
+            bankf[row, k] = (k - left) / (peak - left)
+        # Falling edge of the triangle.
+        for k in range(k_peak, k_right):
+            bankf[row, k] = (right - k) / (right - peak)
+
+    filter_bank = np.dot(puis_spec, np.transpose(bankf))
+    # Guard against log(0) downstream.
+    return np.where(filter_bank == 0, np.finfo(float).eps, filter_bank)
+
+def mfcc(signal, rate):
+    """Compute 13 cepstral coefficients per frame of ``signal``.
+
+    Pipeline: framing, 44100-point real FFT, power spectrum, triangular
+    filter bank, natural log, orthonormal type-2 DCT.
+
+    Args:
+        signal: 1-D sequence of samples.
+        rate: Sampling rate in Hz, forwarded to the filter bank.
+
+    Returns:
+        Array of shape (n_frames, 13).
+    """
+    frames = divsignalaudiobis(signal)
+    spectrum = np.fft.rfft(frames, 44100)
+    log_energies = np.log(BankFiltre(rate, puissance_spec(spectrum)))
+    coefficients = dct(log_energies, type=2, axis=1, norm="ortho")
+    # Only the first 13 coefficients are kept.
+    return coefficients[:, 0:13]
+def calculate_dtw_cost(mfccs_query , mfccs_train):
+    """Return the normalised DTW cost between two MFCC sequences.
+
+    Frames are compared with the cosine distance, then aligned by ``dp``.
+    """
+    pairwise = dist.cdist(mfccs_query, mfccs_train, metric="cosine")
+    # dp returns (accumulated matrix, normalised cost); only the cost is used.
+    return dp(pairwise)[1]
+def recognize_speech(audio_query, audio_train_list, sr):
+    """Find the training recording closest to the query.
+
+    Args:
+        audio_query: Samples of the recording to recognise.
+        audio_train_list: Sequence of reference recordings.
+        sr: Sampling frequency in Hz, shared by all recordings.
+
+    Returns:
+        Index into ``audio_train_list`` of the lowest DTW cost.
+    """
+    mfccs_query = mfcc(audio_query, sr)
+    dtw_costs = [
+        calculate_dtw_cost(mfccs_query, mfcc(audio_train, sr))
+        for audio_train in audio_train_list
+    ]
+    return np.argmin(dtw_costs)
+
+# Example usage
+
+def record_audio(filename, duration, sr):
+    """Record mono 16-bit audio from the default input and save it as WAV.
+
+    Args:
+        filename: Output path without extension; ``.wav`` is appended.
+        duration: Recording length in seconds.
+        sr: Sampling rate in Hz.
+    """
+    chunk = 1024
+    sample_format = pyaudio.paInt16
+    channels = 1
+    filename = f"{filename}.wav"
+
+    p = pyaudio.PyAudio()
+    try:
+        # Read the sample width while the PyAudio instance is still alive.
+        sample_width = p.get_sample_size(sample_format)
+        stream = p.open(format=sample_format,
+                        channels=channels,
+                        rate=sr,
+                        frames_per_buffer=chunk,
+                        input=True)
+        try:
+            print(f"Enregistrement en cours...")
+            frames = [stream.read(chunk)
+                      for _ in range(int(sr / chunk * duration))]
+        finally:
+            # Release the device even if a read fails.
+            stream.stop_stream()
+            stream.close()
+    finally:
+        p.terminate()
+
+    print("Enregistrement terminé")
+
+    # The context manager closes the file even if a write raises.
+    with wave.open(filename, "wb") as wf:
+        wf.setnchannels(channels)
+        wf.setsampwidth(sample_width)
+        wf.setframerate(sr)
+        wf.writeframes(b"".join(frames))
+
+    print(f"Fichier enregistré sous {filename}")
+
+def coupe_silence(signal, min_run=88):
+    """Strip the leading run of exact-zero samples from ``signal``.
+
+    The run is removed only when it is longer than ``min_run`` samples;
+    shorter runs are treated as part of the recording and kept.
+
+    Args:
+        signal: Sliceable sequence of samples (list or NumPy array).
+        min_run: Longest leading run of zeros that is left untouched.
+
+    Returns:
+        The trimmed signal (a slice of the input), or the input itself when
+        nothing is removed. The input is never modified.
+    """
+    n = len(signal)
+    lead = 0
+    # The bound check handles empty and all-zero signals.
+    while lead < n and signal[lead] == 0:
+        lead += 1
+    if lead > min_run:
+        return signal[lead:]
+    return signal
+
+# TODO: detect the case where no grade was spoken
+def get_grade(debug=True):
+    """Record the user and return the index of the recognised word.
+
+    Args:
+        debug: When true (the default, matching the current stub), skip
+            recording and recognition: wait 6 seconds and return 5.
+
+    Returns:
+        In debug mode, 5. Otherwise the index, in sorted file-name order,
+        of the file in ``audio_data/`` closest to the recording, or
+        ``False`` when that directory holds no file.
+    """
+    if debug:
+        ######## TEST DEBUG ########
+        time.sleep(6)
+        return 5
+
+    sr = 44100  # sampling rate in Hz
+    duration = 6  # recording length in seconds
+    filename = "recording"  # output file name, without extension
+    data_dir = "audio_data/"
+    record_audio(filename, duration, sr)
+    audio_query, sr = librosa.load(f'{filename}.wav', sr=sr)
+    # Use the trimmed signal when the helper returns one.
+    trimmed = coupe_silence(audio_query)
+    if trimmed is not None:
+        audio_query = trimmed
+    # os.listdir order is arbitrary; sort so the index always means the same file.
+    training_file_names = sorted(
+        os.path.join(data_dir, path)
+        for path in os.listdir(data_dir)
+        if os.path.isfile(os.path.join(data_dir, path))
+    )
+    print(training_file_names)
+    if not training_file_names:
+        # Nothing to compare against: report "no grade" instead of crashing.
+        return False
+    audio_train_list = [librosa.load(file, sr=sr)[0] for file in training_file_names]
+    recognized_word_index = recognize_speech(audio_query, audio_train_list, sr)
+    print(f'Recognized word: {recognized_word_index}')
+    return recognized_word_index
+
+if __name__ == "__main__":
+    # Manual test only: this module is imported by manager.py, and an
+    # unguarded call would block that import.
+    print(get_grade())
--- a/code/backend_reconnaissance/manager.py
+++ b/code/backend_reconnaissance/manager.py
@@ -1,4 +1,5 @@
 from hand_detector import HandDetector
+from audio_detector import get_grade
 from network import WebsocketServer
 import time

@@ -6,11 +7,13 @@ import time
 class Manager():
    def __init__(self):
        self.state = 0
-        self.avis = {
+        self.defualtAvis = {
            "note": None,
            "commentaire": None,
            "notes_autres": {}
        }
+
+        self.avis = self.defualtAvis
        self.server = WebsocketServer(None)
        self.server.start()
        self.handDetector = HandDetector()
@@ -23,15 +26,29 @@ class Manager():
                self.sleep()
            if(self.state == 1):
                self.camera()
-
+            if(self.state == 2):
+                self.audio()
+            if(self.state == 3):
+                self.thankYou()
            time.sleep(0.01)

    #Fonction qui est executée pendant que la borne est en veille, reveille la borne si une main est detectée
    def sleep(self):
        res = self.handDetector.detect()
+        print(res)
        if(res != False):
            self.state = 1
            self.server.sendMessage({"type": "state", "state": 1})
+    
+    def audio(self):
+        """Get a spoken grade, send it to the screen and move to state 3."""
+        grade = get_grade()
+        # Identity test: `0 != False` is false, so a grade of 0 would be dropped.
+        if grade is not False:
+            self.server.sendMessage({"type":"new_grade","grade":grade})
+            self.avis["notes_autres"]["test"] = grade
+            # Leave the grade on screen for a moment before moving on.
+            time.sleep(3)
+            # Must match the state announced below, or the main loop keeps
+            # calling audio() and never reaches thankYou().
+            self.state = 3
+            self.server.sendMessage({"type": "state", "state": 3})
+

    #Envoie la position de la main a l'écran et passe a l'étape suivante si une main est detectée pendant assez longtemps
    def camera(self):
@@ -44,5 +61,10 @@ class Manager():
                self.state = 2
                self.server.sendMessage({"type": "state", "state": 2})

-        
+    def thankYou(self):
+        """Wait 10 s, return to state 0, send the review and reset it."""
+        time.sleep(10)
+        self.state = 0
+        self.server.sendMessage({"type": "state", "state": 0})
+        self.sendReview()
+        # Build a fresh dict: __init__ binds self.avis to the same object as
+        # self.defualtAvis, so reassigning it would keep this session's grades.
+        self.avis = {
+            "note": None,
+            "commentaire": None,
+            "notes_autres": {}
+        }

--- a/code/backend_reconnaissance/network.py
+++ b/code/backend_reconnaissance/network.py
@@ -28,4 +28,10 @@ class WebsocketServer(threading.Thread):
            await asyncio.sleep(0.01)

    def sendMessage(self,message):
-        self.messageQueue.append(message)
+        self.messageQueue.append(message)
+
+class ApiClient():
+    """Holds the host and port of the reviews API."""
+
+    def __init__(self, host=None, port=None):
+        """Store the API address.
+
+        Args:
+            host: API host name; defaults to the ``API_HOST`` environment variable.
+            port: API port; defaults to the ``API_PORT`` environment variable.
+        """
+        # Local import: the top of this module is not visible here, and
+        # reading the environment now picks up values set after import.
+        import os
+        self.host = host if host is not None else os.getenv("API_HOST")
+        self.port = port if port is not None else os.getenv("API_PORT")
+        
--- a/code/backend_reconnaissance/recording.wav
+++ b/code/backend_reconnaissance/recording.wav
--- a/code/backend_reconnaissance/requirements.txt
+++ b/code/backend_reconnaissance/requirements.txt
@@ -2,4 +2,7 @@ websockets
 requests
 opencv-python
 mediapipe
-numpy
+numpy
+pyaudio
+librosa
+scipy
--- a/code/docker-compose.yaml
+++ b/code/docker-compose.yaml
@@ -39,6 +39,8 @@ services:
  #API de gestion des avis, permet d'ajouter ou de récuperer des avis ou les stats sur les avis par des requêtes HTTP
  reviews_api:
    container_name: reviews_api
+    expose:
+      - 8080
    ports:
      - 8080:8080
    environment:
@@ -73,11 +75,11 @@ services:
      - 800:80

  #Formulaire de retour d'avis
-  Formulaire:
+  formulaire:
    image: httpd:latest
    volumes:
-      - ./Formulaire:/usr/local/apache2/htdocs/
-    container_name: Formulaire
+      - ./formulaire:/usr/local/apache2/htdocs/
+    container_name: formulaire
    ports:
      - 80:80
  # #Backend de la borne : scripts pythons de reconnaissances video et audio
@@ -92,6 +94,8 @@ services:
    environment:
      - PORT=5000
      - HOST=backend_reconnaissance
+      - API_HOST=reviews_api
+      - API_PORT=8080
    ports:
      #Ce container est le serveur websocker dont le client est l'interface de la borne qui tourne dans le navigateur
      - 5000:5000
--- a/code/interface_borne/assets/css/main.css
+++ b/code/interface_borne/assets/css/main.css
@@ -48,4 +48,9 @@ html, body {

 .instructions > .title {
    border-bottom: 3px #6B8000 solid;
+} 
+
+/* th and td are children of tr, so they need descendant selectors to match. */
+.instructions > table, .instructions th, .instructions td {
+  border: 1px solid #6B8000;
+  border-collapse: collapse;
 }
--- a/code/interface_borne/assets/js/audio_page.js
+++ b/code/interface_borne/assets/js/audio_page.js
@@ -8,4 +8,10 @@ class AudioPage {
        this.isEnabled = isEnabled;
        this.DOMElement.style.display = isEnabled ? "block" : "none";
    }
+
+    /**
+     * Show the recognised grade on the audio page.
+     * Does nothing while the page is disabled or if it has no #grade element.
+     * @param {*} grade value to display; converted with String().
+     */
+    setGrade(grade) {
+        if (!this.isEnabled) {
+            return;
+        }
+        // getElementById exists on Document, not Element; querySelector
+        // works on an element and searches its descendants.
+        const gradeElement = this.DOMElement.querySelector("#grade");
+        if (gradeElement !== null) {
+            // textContent: the grade comes from the network and must not be parsed as HTML.
+            gradeElement.textContent = String(grade);
+        }
+    }
 }
--- a/code/interface_borne/assets/js/network.js
+++ b/code/interface_borne/assets/js/network.js
@@ -1,5 +1,5 @@
 class WebsocketClient {
-    constructor(onNewEffects, onNewGrade, onNewState) {
+    constructor(onNewEffects, onNewState, onNewGrade) {
        this.socket = new WebSocket("ws://localhost:5000");
        this.socket.addEventListener("open", (event)  => {
            this.socket.send("connected");
--- a/code/interface_borne/assets/js/state_manager.js
+++ b/code/interface_borne/assets/js/state_manager.js
@@ -18,7 +18,8 @@ class StateManager {
                this.setState(STATE.video);
                this._cameraPage.setEffects(effects)
            },
-            (state) => this.setState(state)
+            (state) => this.setState(state),
+            (grade) => this._audioPage.setGrade(grade)
        );
        
        this._sleepingPage.enabled = true;
--- a/code/interface_borne/index.html
+++ b/code/interface_borne/index.html
@@ -35,7 +35,7 @@
            <div class="title">
                <h1>Enregistrement audio blabal</h1>
            </div>
-            <p>Prononcez à voix haute les notes correspondant aux critères suivants dans l'ordre</p>
+            <p>Donnez une note sur 10 au critère suivant</p>
            <table>
                <tr>
                    <th>Critère</td>
@@ -43,7 +43,7 @@
                </tr>
                <tr>
                    <td>Calme</td>
-                    <td> /10</td>
+                    <td> <span id="grade"></span>/10</td>
                </tr>
            </table>
        </div>