ajout fonctions de base pour parse des données et les analyser

2026-02-09 02:30:17 +01:00 · 2024-05-01 23:30:31 +02:00
parent 239aa49d7f
commit e6cfd468a6
10 changed files with 167653 additions and 0 deletions
--- a/pycache/analyze.cpython-310.pyc
+++ b/pycache/analyze.cpython-310.pyc
--- a/pycache/generate_data.cpython-310.pyc
+++ b/pycache/generate_data.cpython-310.pyc
--- a/pycache/poll.cpython-310.pyc
+++ b/pycache/poll.cpython-310.pyc
--- a/analyze.py
+++ b/analyze.py
@@ -0,0 +1,24 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+
+def distribution_of_differences(df, column_name):
+    # Check if the column exists in the DataFrame
+    if column_name not in df.columns:
+        raise ValueError(f"The column '{column_name}' does not exist in the DataFrame.")
+    
+    # Calculate differences between consecutive rows for the specified column
+    differences = df[column_name].diff().abs()
+
+    # The first element of differences will be NaN since there's no previous element for the first row
+    differences = differences.dropna()  # Remove NaN values
+
+    return differences
+
+def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
+    plt.figure(figsize=(8, 4))  # Set the figure size for better readability
+    plt.hist(data_series, bins=bins, color='blue', alpha=0.7, edgecolor='black')
+    plt.title(title)
+    plt.xlabel('Absolute Difference')
+    plt.ylabel('Frequency')
+    plt.grid(True)
+    plt.show()
--- a/datasets/greenhouse.csv
+++ b/datasets/greenhouse.csv
--- a/env/.gitignore
+++ b/env/.gitignore
@@ -0,0 +1,12 @@
+# Virtualenv
+# http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
+.Python
+[Bb]in
+[Ii]nclude
+[Ll]ib
+[Ll]ib64
+[Ll]ocal
+[Ss]cripts
+pyvenv.cfg
+.venv
+pip-selfcheck.json
--- a/generate_data.py
+++ b/generate_data.py
@@ -0,0 +1,52 @@
+import pandas as pd
+import numpy as np
+from opensimplex import OpenSimplex
+import datetime
+
+def generate_greenhouse_data(filepath):
+    # Read the CSV file into a DataFrame, parsing 'time' as datetime
+    df = pd.read_csv(filepath, parse_dates=["time"], dtype={"id": str, "value": float})
+    
+    # Compute the absolute differences between consecutive temperature readings
+    df['diff'] = df['value'].diff().abs()
+    
+    # Initial value for 'diff' will be NaN; we can fill it with 0 or a small number
+    df['diff'] = df['diff'].fillna(0)
+    
+    # Filter the DataFrame:
+    # 1. Exclude temperature values that are too high (>50) or too low (<-10)
+    # 2. Exclude rows where the difference from the previous reading is greater than 6
+    filtered_df = df[(df['value'] > 0) & (df['value'] < 50) & (df['diff'] <= 6)]
+    
+    # Drop the 'diff' column as it's no longer needed after filtering
+    filtered_df = filtered_df.drop(columns=['diff'])
+    
+    return filtered_df
+
+
+def generate_simplex(start_time=None, end_time=None, interval=600, max_temp=30, min_temp=10, frequency=10):
+    # Default time settings if none provided
+    if end_time is None:
+        end_time = datetime.datetime.now()
+    if start_time is None:
+        start_time = end_time - datetime.timedelta(days=1)
+    
+    # Calculate the number of samples needed based on the interval
+    total_seconds = int((end_time - start_time).total_seconds())
+    steps = total_seconds // interval
+    
+    # Time array
+    times = [start_time + datetime.timedelta(seconds=i * interval) for i in range(steps + 1)]
+    
+    # Simplex noise generator
+    simplex = OpenSimplex(seed=np.random.randint(0, 1000))
+    
+    # Generate noise values and scale them
+    temperatures = [simplex.noise2(x=i / frequency, y=0) for i in range(steps + 1)]
+    
+    # Map Simplex noise output (usually in range [-1, 1]) to the [min_temp, max_temp]
+    scaled_temperatures = min_temp + (np.array(temperatures) + 1) / 2 * (max_temp - min_temp)
+    
+    # Create DataFrame
+    df = pd.DataFrame({'time': times, 'value': scaled_temperatures})
+    return df
--- a/main.py
+++ b/main.py
@@ -0,0 +1,36 @@
+import pandas as pd
+import matplotlib.pyplot as plt
+from generate_data import *
+from analyze import *
+from poll import *
+
+def plot_temperature_data(df, recent_count=None):
+    plt.figure(figsize=(10, 5))
+    
+    # Check if recent_count is specified and valid
+    if recent_count is not None and recent_count > 0:
+        df = df.tail(recent_count)  # Slice the DataFrame to get the last 'recent_count' rows
+    
+    plt.plot(df['time'], df['value'], label='Temperature', color='tab:red')
+    plt.title('Temperature Over Time')
+    plt.xlabel('Time')
+    plt.ylabel('Temperature (°C)')
+    plt.grid(True)
+    plt.legend()
+    plt.xticks(rotation=45)  # Rotates the x-axis labels to make them more readable
+    plt.tight_layout()  # Adjusts subplot params so that the subplot(s) fits in to the figure area.
+    plt.show()
+
+# Load the data from the CSV file
+df  = generate_greenhouse_data("datasets/greenhouse.csv")
+plot_temperature_data(df)
+df2 = sample_every_kth_point(df,50)
+
+diff1 = distribution_of_differences(df, 'value')
+diff2 = distribution_of_differences(df2, 'value')
+
+diff1 = diff1[diff1 <= 10]
+diff2 = diff2[diff2 <= 10]
+
+plot_histogram(diff1,bins=20, title='Distribution of Absolute Differences (Original Data)')
+plot_histogram(diff2, bins=20, title='Distribution of Absolute Differences (Sampled Data)')
--- a/poll.py
+++ b/poll.py
@@ -0,0 +1,10 @@
+def sample_every_kth_point(df, k):
+    # Validate the input to ensure k is positive and does not exceed the DataFrame length
+    if k <= 0:
+        raise ValueError("k must be a positive integer.")
+    if k > len(df):
+        raise ValueError("k is greater than the number of rows in the DataFrame.")
+
+    # Sample every k-th point
+    sampled_df = df.iloc[::k]
+    return sampled_df
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
+pandas
+noise
+numpy
+matplotlib
+opensimplex