Ajout des fonctions de base pour parser les données et les analyser

2026-02-09 10:40:17 +01:00 · 2024-05-01 23:30:31 +02:00
parent 239aa49d7f
commit e6cfd468a6
10 changed files with 167653 additions and 0 deletions
--- a/pycache/analyze.cpython-310.pyc
+++ b/pycache/analyze.cpython-310.pyc
--- a/pycache/generate_data.cpython-310.pyc
+++ b/pycache/generate_data.cpython-310.pyc
--- a/pycache/poll.cpython-310.pyc
+++ b/pycache/poll.cpython-310.pyc
--- a/analyze.py
+++ b/analyze.py
@@ -0,0 +1,24 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 def distribution_of_differences(df, column_name):
    """Return the absolute row-to-row changes of one DataFrame column.

    Args:
        df: DataFrame holding the column to analyse.
        column_name: Label of the column whose consecutive differences
            are wanted.

    Returns:
        A Series of ``abs(current - previous)`` values. Entries that are
        NaN are removed, which always includes the first row because it
        has no predecessor; the remaining entries keep their index labels.

    Raises:
        ValueError: If ``column_name`` is not one of ``df``'s columns.
    """
    if column_name not in df.columns:
        raise ValueError(f"The column '{column_name}' does not exist in the DataFrame.")
    # One pipeline: step between neighbours -> magnitude -> discard NaN gaps.
    return df[column_name].diff().abs().dropna()
 def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
    """Draw a histogram of ``data_series`` in a new figure and show it.

    Args:
        data_series: Values to bin (passed straight to the histogram call).
        bins: Bin specification forwarded to the histogram call.
        title: Text placed above the plot.
    """
    # A fresh 8x4 figure becomes the current one; its axes are then
    # addressed directly instead of through the pyplot wrappers.
    plt.figure(figsize=(8, 4))
    axes = plt.gca()
    axes.hist(data_series, bins=bins, color='blue', alpha=0.7, edgecolor='black')
    axes.set_title(title)
    axes.set_xlabel('Absolute Difference')
    axes.set_ylabel('Frequency')
    axes.grid(True)
    plt.show()
--- a/datasets/greenhouse.csv
+++ b/datasets/greenhouse.csv
--- a/env/.gitignore
+++ b/env/.gitignore
@@ -0,0 +1,12 @@
 # Virtualenv
 # http://iamzed.com/2009/05/07/a-primer-on-virtualenv/
 .Python
 [Bb]in
 [Ii]nclude
 [Ll]ib
 [Ll]ib64
 [Ll]ocal
 [Ss]cripts
 pyvenv.cfg
 .venv
 pip-selfcheck.json
--- a/generate_data.py
+++ b/generate_data.py
@@ -0,0 +1,52 @@
 import pandas as pd
 import numpy as np
 from opensimplex import OpenSimplex
 import datetime
 def generate_greenhouse_data(filepath, min_value=0, max_value=50, max_diff=6):
    """Load temperature readings from a CSV file and drop implausible rows.

    The file must contain a ``time`` column (parsed as datetime) and a
    ``value`` column (read as float); an ``id`` column, when present, is
    read as str.

    A row is kept only when both conditions hold:

    * ``min_value < value < max_value`` (both bounds exclusive), and
    * the absolute difference to the previous row *of the raw file* is at
      most ``max_diff``. The first row has no predecessor, so its
      difference counts as 0.

    Because the difference is taken against the raw file, a normal reading
    that directly follows a spike is dropped together with the spike.

    Args:
        filepath: Path or file-like object accepted by ``pandas.read_csv``.
        min_value: Exclusive lower bound for ``value``. Defaults to 0.
        max_value: Exclusive upper bound for ``value``. Defaults to 50.
        max_diff: Largest accepted absolute jump between consecutive raw
            readings (inclusive). Defaults to 6.

    Returns:
        A new DataFrame with the file's columns and only the rows that
        passed the filter; original index labels are preserved.
    """
    df = pd.read_csv(filepath, parse_dates=["time"], dtype={"id": str, "value": float})

    values = df["value"]
    # Keep the jump sizes in a standalone Series instead of a temporary
    # 'diff' column, so a CSV column with that name is never overwritten.
    jumps = values.diff().abs().fillna(0)

    keep = (values > min_value) & (values < max_value) & (jumps <= max_diff)
    return df[keep]
 def generate_simplex(start_time=None, end_time=None, interval=600, max_temp=30, min_temp=10, frequency=10, seed=None):
    """Generate a synthetic temperature series from OpenSimplex noise.

    Samples are spaced ``interval`` seconds apart from ``start_time`` up to
    at most ``end_time``. The noise value of each sample is mapped linearly
    to the ``[min_temp, max_temp]`` range, assuming the raw noise lies in
    ``[-1, 1]``.

    Args:
        start_time: First timestamp. Defaults to one day before ``end_time``.
        end_time: Upper limit for timestamps. Defaults to the current
            local time (naive ``datetime.datetime.now()``).
        interval: Seconds between samples; must be positive.
        max_temp: Value that a noise output of +1 maps to.
        min_temp: Value that a noise output of -1 maps to.
        frequency: Divisor applied to the sample index before it is fed to
            the noise function (larger means smoother); must be non-zero.
        seed: Seed for the noise generator. When None, a random seed in
            ``[0, 1000)`` is drawn, so every call gives a different series.
            Pass a fixed value for reproducible output.

    Returns:
        A DataFrame with columns ``time`` and ``value``. It is empty when
        ``end_time`` lies before ``start_time``.

    Raises:
        ValueError: If ``interval`` is not positive or ``frequency`` is 0.
    """
    # Fail early with a clear message instead of a bare ZeroDivisionError.
    if interval <= 0:
        raise ValueError("interval must be a positive number of seconds.")
    if frequency == 0:
        raise ValueError("frequency must be non-zero.")

    if end_time is None:
        end_time = datetime.datetime.now()
    if start_time is None:
        start_time = end_time - datetime.timedelta(days=1)

    total_seconds = int((end_time - start_time).total_seconds())
    # int() keeps range() usable when a float interval is passed.
    steps = int(total_seconds // interval)
    sample_indices = range(steps + 1)

    times = [start_time + datetime.timedelta(seconds=i * interval) for i in sample_indices]

    if seed is None:
        seed = np.random.randint(0, 1000)
    simplex = OpenSimplex(seed=seed)

    # Walk along the x axis of the 2D noise field; y stays fixed at 0.
    noise_values = np.array([simplex.noise2(x=i / frequency, y=0) for i in sample_indices])

    # Linear map from the assumed [-1, 1] noise range to [min_temp, max_temp].
    scaled_temperatures = min_temp + (noise_values + 1) / 2 * (max_temp - min_temp)

    return pd.DataFrame({'time': times, 'value': scaled_temperatures})
--- a/main.py
+++ b/main.py
@@ -0,0 +1,36 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 from generate_data import *
 from analyze import *
 from poll import *
 def plot_temperature_data(df, recent_count=None):
    """Plot the ``value`` column against the ``time`` column and show it.

    Args:
        df: DataFrame providing ``time`` and ``value`` columns.
        recent_count: When given and greater than zero, only the last
            ``recent_count`` rows are plotted; otherwise all rows are used.
    """
    figure = plt.figure(figsize=(10, 5))
    axes = plt.gca()

    wants_tail = recent_count is not None and recent_count > 0
    rows = df.tail(recent_count) if wants_tail else df

    axes.plot(rows['time'], rows['value'], label='Temperature', color='tab:red')
    axes.set_title('Temperature Over Time')
    axes.set_xlabel('Time')
    axes.set_ylabel('Temperature (°C)')
    axes.grid(True)
    axes.legend()
    # Tilt the tick labels so neighbouring timestamps do not overlap.
    plt.xticks(rotation=45)
    # Re-fit the axes so the rotated labels stay inside the figure.
    figure.tight_layout()
    plt.show()
 # Script body: runs at import time and opens three plot windows in sequence.
 # Load the readings from the CSV file; generate_greenhouse_data also filters
 # out rows with out-of-range values or large jumps before returning.
 df  = generate_greenhouse_data("datasets/greenhouse.csv")
 plot_temperature_data(df)
 # Keep every 50th row of the filtered data for comparison.
 df2 = sample_every_kth_point(df,50)
 # Absolute row-to-row changes of 'value' for the full and the sampled data.
 diff1 = distribution_of_differences(df, 'value')
 diff2 = distribution_of_differences(df2, 'value')
 # Ignore differences above 10 so both histograms cover a comparable range.
 diff1 = diff1[diff1 <= 10]
 diff2 = diff2[diff2 <= 10]
 plot_histogram(diff1,bins=20, title='Distribution of Absolute Differences (Original Data)')
 plot_histogram(diff2, bins=20, title='Distribution of Absolute Differences (Sampled Data)')
--- a/poll.py
+++ b/poll.py
@@ -0,0 +1,10 @@
 def sample_every_kth_point(df, k):
    """Return every ``k``-th row of ``df``, beginning with the first row.

    Args:
        df: DataFrame to thin out.
        k: Step between kept rows; 1 keeps every row.

    Returns:
        The rows at positions 0, k, 2k, ... with their original index
        labels.

    Raises:
        ValueError: If ``k`` is zero or negative, or if ``k`` is larger
            than the number of rows in ``df``.
    """
    if k <= 0:
        raise ValueError("k must be a positive integer.")
    if len(df) < k:
        raise ValueError("k is greater than the number of rows in the DataFrame.")
    # Positional stride: start at row 0 and step k rows at a time.
    stride = slice(None, None, k)
    return df.iloc[stride]
--- a/requirements.txt
+++ b/requirements.txt
@@ -0,0 +1,2 @@
 pandas
 noise
 numpy
 matplotlib
 opensimplex