From a9a9777aefd7a9f0f3c2e316a522d2cfe04d51e5 Mon Sep 17 00:00:00 2001
From: Quentin Roussel <contact@quentinrsl.com>
Date: Mon, 6 May 2024 21:48:50 +0200
Subject: [PATCH] finished compare plot

---
 analyze.py |  57 +++++++++++++--
 main.py    | 203 +++++++++++++++++++++++++++++++++++++++++++++++++----
 poll.py    |  49 ++++++++++++-
 3 files changed, 289 insertions(+), 20 deletions(-)

diff --git a/analyze.py b/analyze.py
index 6be15b7..31cb08d 100644
--- a/analyze.py
+++ b/analyze.py
@@ -1,18 +1,26 @@
 import pandas as pd
 import matplotlib.pyplot as plt
 
-def distribution_of_differences(df, column_name):
+def error(df, df_original, column_name):
+    """Return |df_original[column_name] - column_name of the last df row at or before that time| for rows 1.. of df_original; rows with no earlier df row are skipped."""
     # Check if the column exists in the DataFrame
     if column_name not in df.columns:
         raise ValueError(f"The column '{column_name}' does not exist in the DataFrame.")
+    diff = []
+    def last_value_before(timestamp):
+        earlier = df[df['time'] <= timestamp]  # rows of df at or before timestamp, in row order
+        if earlier.empty:
+            raise ValueError("No point before the date")
+        return earlier.iloc[-1]
     
-    # Calculate differences between consecutive rows for the specified column
-    differences = df[column_name].diff().abs()
+    for i in range(1, len(df_original)):
+        try:
+            diff.append(abs(df_original[column_name].iloc[i] - last_value_before(df_original["time"].iloc[i])[column_name]))
+        except ValueError:
+            continue  # no row of df at or before this timestamp: nothing to compare against
+    return diff
 
-    # The first element of differences will be NaN since there's no previous element for the first row
-    differences = differences.dropna()  # Remove NaN values
 
-    return differences
 
 def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
     plt.figure(figsize=(8, 4))  # Set the figure size for better readability
@@ -22,3 +30,40 @@ def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differe
     plt.ylabel('Frequency')
     plt.grid(True)
     plt.show()
+
+def compute_efficiency(df):
+    """Return the covered time span in seconds divided by the number of rows.
+
+    The span is the last 'time' entry minus the first one, in row order.
+    """
+    timestamps = df["time"]
+    span = timestamps.iloc[-1] - timestamps.iloc[0]
+    return span.total_seconds() / len(df)
+
+def hourly_rate_of_change(df):
+    """Return the mean absolute rate of change of 'value' per hour of the day.
+
+    The rate between consecutive rows is |delta value| / delta time, with the
+    time delta expressed in hours; each rate is attributed to the hour of the
+    later row. The result is a Series indexed by hour (only hours present in
+    the data). Raises ValueError if 'time'/'value' are missing, the frame is
+    empty, or 'time' is not a datetime column.
+
+    The input DataFrame is left untouched: intermediate results are kept in
+    local Series instead of being written back as new columns.
+    """
+    if 'time' not in df.columns or 'value' not in df.columns:
+        raise ValueError("DataFrame must include 'time' and 'value' columns.")
+    if df.empty:
+        raise ValueError("The DataFrame is empty.")
+    if not pd.api.types.is_datetime64_any_dtype(df['time']):
+        raise ValueError("'time' column must be of datetime type.")
+
+    # Elapsed time between consecutive rows, converted from seconds to hours
+    elapsed_hours = df['time'].diff().dt.total_seconds() / 3600
+    # Absolute change per hour; the first row has no predecessor and yields NaN
+    rate_of_change = (df['value'].diff() / elapsed_hours).abs().rename('rate_of_change')
+    # Name the grouping key 'hour' so the result's index is labelled 'hour'
+    hours = df['time'].dt.hour.rename('hour')
+
+    return rate_of_change.groupby(hours).mean()
diff --git a/main.py b/main.py
index ee8ce97..f452c78 100644
--- a/main.py
+++ b/main.py
@@ -4,15 +4,18 @@ from generate_data import *
 from analyze import *
 from poll import *
 
+def sort(X,Y):
+    """Sort the (x, y) pairs in ascending order (ties on x are ordered by y) and return an iterator over the reordered X and Y columns, each as a tuple."""
+    return zip(*sorted(zip(X,Y)))
+
 def plot_temperature_data(df, recent_count=None):
-    plt.figure(figsize=(10, 5))
+    plt.figure(figsize=(5, 5))
     
     # Check if recent_count is specified and valid
     if recent_count is not None and recent_count > 0:
         df = df.tail(recent_count)  # Slice the DataFrame to get the last 'recent_count' rows
     
-    plt.plot(df['time'], df['value'], label='Temperature', color='tab:red')
-    plt.title('Temperature Over Time')
+    plt.plot(df['time'], df['value'], label='Temperature', color='tab:red', marker='x')
     plt.xlabel('Time')
     plt.ylabel('Temperature (°C)')
     plt.grid(True)
@@ -21,16 +24,190 @@ def plot_temperature_data(df, recent_count=None):
     plt.tight_layout()  # Adjusts subplot params so that the subplot(s) fits in to the figure area.
     plt.show()
 
-# Load the data from the CSV file
-df  = generate_greenhouse_data("datasets/greenhouse.csv")
-plot_temperature_data(df)
-df2 = sample_every_kth_point(df,50)
+def test_sample_every_kth_point(df):
+    """Sweep k = 1..9 with sample_every_kth_point; return X (the k values) and the per-k EFFICIENCY, MEAN, MEDIAN, STD lists."""
+    X = np.arange(1, 10, 1)
+    MEAN = []
+    STD = []
+    MEDIAN = []
+    EFFICIENCY = []
+    for x in X:
+        print(x)
+        df_sampled = sample_every_kth_point(df, int(x))
 
-diff1 = distribution_of_differences(df, 'value')
-diff2 = distribution_of_differences(df2, 'value')
+        diff = error(df_sampled, df, 'value')
 
-diff1 = diff1[diff1 <= 10]
-diff2 = diff2[diff2 <= 10]
+        MEAN.append(np.mean(diff))
+        STD.append(np.std(diff))
+        MEDIAN.append(np.median(diff))
+        EFFICIENCY.append(compute_efficiency(df_sampled))
+    
+    return X, EFFICIENCY, MEAN, MEDIAN, STD
 
-plot_histogram(diff1,bins=20, title='Distribution of Absolute Differences (Original Data)')
-plot_histogram(diff2, bins=20, title='Distribution of Absolute Differences (Sampled Data)')
+
+def example_sample_every_kth_point(k=10):
+    """Plot every k-th point of the last 150 rows returned by generate_greenhouse_data()."""
+    recent = generate_greenhouse_data("datasets/greenhouse.csv").tail(150)
+    sampled = sample_every_kth_point(recent, k)
+    plot_temperature_data(sampled)
+
+def example_sample_reglin():
+    """Plot sample_reglin() (default settings) on the last 150 rows returned by generate_greenhouse_data()."""
+    recent = generate_greenhouse_data("datasets/greenhouse.csv").tail(150)
+    sampled = sample_reglin(recent)
+    plot_temperature_data(sampled)
+
+def exaample_optimal_sample(dT = 0.3):
+    """Plot optimal_sample() with threshold dT on the last 150 rows returned by generate_greenhouse_data()."""
+    recent = generate_greenhouse_data("datasets/greenhouse.csv").tail(150)
+    sampled = optimal_sample(recent, threshold_dT=dT)
+    plot_temperature_data(sampled)
+
+def example_sample_avg_rate_of_change():
+    """Plot the last 150 rows resampled with per-hour poll rates of 3600 / (hourly rate of change of the full data)."""
+    full = generate_greenhouse_data("datasets/greenhouse.csv")
+    hroc = hourly_rate_of_change(full)
+    sampled = sample_avg_rate_of_change(full.tail(150), 3600 * 1 / hroc)
+    plot_temperature_data(sampled)
+
+def test_sample_reglin(df):
+    """Sweep max_dT over np.arange(0.4, 3, 0.05) for sample_reglin.
+
+    Each threshold is printed, df is resampled and compared with the input
+    through error(); returns (X, EFFICIENCY, MEAN, MEDIAN, STD) where X is the
+    threshold array and the others are per-threshold lists.
+    """
+    thresholds = np.arange(0.4, 3, 0.05)
+    efficiencies, means, medians, stds = [], [], [], []
+    for max_dT in thresholds:
+        print(max_dT)
+        resampled = sample_reglin(df, max_dT=max_dT)
+        errors = error(resampled, df, 'value')
+        means.append(np.mean(errors))
+        stds.append(np.std(errors))
+        medians.append(np.median(errors))
+        efficiencies.append(compute_efficiency(resampled))
+    return thresholds, efficiencies, means, medians, stds
+
+
+def test_optimal_sample(df):
+    """Sweep threshold_dT over np.arange(0.1, 3, 0.05) for optimal_sample.
+
+    Each threshold is printed, df is resampled and compared with the input
+    through error(); returns (X, EFFICIENCY, MEAN, MEDIAN, STD) where X is the
+    threshold array and the others are per-threshold lists.
+    """
+    thresholds = np.arange(0.1, 3, 0.05)
+    efficiencies, means, medians, stds = [], [], [], []
+    for threshold in thresholds:
+        print(threshold)
+        resampled = optimal_sample(df, threshold_dT=threshold)
+        errors = error(resampled, df, 'value')
+        means.append(np.mean(errors))
+        stds.append(np.std(errors))
+        medians.append(np.median(errors))
+        efficiencies.append(compute_efficiency(resampled))
+    return thresholds, efficiencies, means, medians, stds
+
+def test_sample_avg_rate_of_change(df,hourly_rate_of_change):
+    """Sweep a factor x over np.arange(0.01, 3, 0.05) for sample_avg_rate_of_change.
+
+    Each x is printed, df is resampled with poll_rate 3600 * x / hourly_rate_of_change
+    and compared with the input through error(); returns (X, EFFICIENCY, MEAN,
+    MEDIAN, STD) where X is the factor array and the others are per-factor lists.
+    """
+    factors = np.arange(0.01, 3, 0.05)
+    efficiencies, means, medians, stds = [], [], [], []
+    for factor in factors:
+        print(factor)
+        resampled = sample_avg_rate_of_change(df, 3600 * factor / hourly_rate_of_change)
+        errors = error(resampled, df, 'value')
+        means.append(np.mean(errors))
+        stds.append(np.std(errors))
+        medians.append(np.median(errors))
+        efficiencies.append(compute_efficiency(resampled))
+    return factors, efficiencies, means, medians, stds
+
+def comparaison_mean(df,limit=1000):
+    """Plot mean error against average polling interval for the four sampling strategies.
+
+    The hourly rate of change is computed on the full frame, then every
+    strategy is swept over the last `limit` rows. One curve per strategy is
+    drawn with the mean error on x and the EFFICIENCY values on y, the points
+    being ordered by increasing mean error.
+    """
+    plt.figure(figsize=(10, 5))
+    hroc = hourly_rate_of_change(df)
+    recent = df.tail(limit)
+    sweeps = (
+        ("Constant Polling Interval", lambda: test_sample_every_kth_point(recent)),
+        ("Linear Regression", lambda: test_sample_reglin(recent)),
+        ("Optimal Polling rate", lambda: test_optimal_sample(recent)),
+        ("Hourly Rate of Change", lambda: test_sample_avg_rate_of_change(recent, hroc)),
+    )
+    for label, run_sweep in sweeps:
+        _, efficiency, mean_error, _, _ = run_sweep()
+        mean_error, efficiency = sort(mean_error, efficiency)
+        plt.plot(mean_error, efficiency, label=label, marker='x')
+    plt.ylabel("Average seconds between polls")
+    plt.xlabel("Average error")
+    plt.ylim(0, 8000)
+    plt.xlim(0,1.3)
+    plt.legend()
+    plt.show()
+
+def example_optimal_sample(dT = 0.3):
+    """Plot optimal_sample() with threshold dT on the last 1000 rows returned by generate_greenhouse_data()."""
+    recent = generate_greenhouse_data("datasets/greenhouse.csv").tail(1000)
+    sampled = optimal_sample(recent, threshold_dT=dT)
+    plt.plot(sampled['time'], sampled['value'], label='Temperature', color='tab:red', marker='x')
+    plt.title('Temperature Over Time')
+    plt.xlabel('Time')
+    plt.ylabel('Temperature (°C)')
+    plt.grid(True)
+    plt.legend()
+    plt.show()
+
+def histogram_sample_every_kth_point(k=10):
+    """Plot a histogram of the error made by keeping only every k-th of the last 1000 greenhouse rows."""
+    df = generate_greenhouse_data("datasets/greenhouse.csv").tail(1000)
+    df_sampled = sample_every_kth_point(df, k)
+    diff = error(df_sampled, df, 'value')  # sampled frame first, reference frame second (swapped order gives all-zero errors)
+    plot_histogram(diff)
+
+# histogram_sample_every_kth_point(1)
+# df  = generate_greenhouse_data("datasets/greenhouse.csv")
+# df = df.tail(1000)
+
+# Comparison of the mean error with simplex
+# df = generate_simplex(interval=600, frequency=10)
+# plt.plot(df['time'], df['value'], label='Temperature', color='tab:red', marker='x')
+# plt.show()
+# comparaison_mean(df)
+#Same thing with the greenhouse data
+# df = generate_greenhouse_data("datasets/greenhouse.csv")
+# df = df.tail(1000)
+# plt.plot(df['time'], df['value'], label='Temperature', color='tab:red', marker='x')
+# plt.show()
+# comparaison_mean(df)
+
+# Temperature rate of change over the day
+# df = generate_greenhouse_data("datasets/greenhouse.csv")
+# hcor = hourly_rate_of_change(df)
+# hcor.plot()
+# plt.xlabel("Hour of the day")
+# plt.ylabel("Average absolute rate of change (°C/hour)")
+# plt.show()
+# plt.ylabel("Absolute rate of change of the temperature (°C/hour)")
+# plt.xlabel("Hour of the day")
+# plt.show()
+
+df = generate_greenhouse_data("datasets/greenhouse.csv")  # module-level frame fed to the comparison below
+comparaison_mean(df, 1000)  # runs whenever this module is executed or imported; limit=1000 rows
+
+# example_sample_every_kth_point(1)
+# example_sample_every_kth_point(10)
+# exaample_optimal_sample()
+# example_sample_reglin()
+# example_sample_avg_rate_of_change()
+    # Calculate differences between consecutive rows for the specified column
\ No newline at end of file
diff --git a/poll.py b/poll.py
index 13262e8..7dd61ff 100644
--- a/poll.py
+++ b/poll.py
@@ -1,3 +1,6 @@
+import datetime
+from analyze import hourly_rate_of_change
+
 def sample_every_kth_point(df, k):
     # Validate the input to ensure k is positive and does not exceed the DataFrame length
     if k <= 0:
@@ -7,4 +10,48 @@ def sample_every_kth_point(df, k):
 
     # Sample every k-th point
     sampled_df = df.iloc[::k]
-    return sampled_df
\ No newline at end of file
+    return sampled_df
+
+def optimal_sample(df, threshold_dT=0.5):
+    """Keep row 0, then every row whose 'value' differs from the last kept row by more than threshold_dT."""
+    if df.empty:
+        return df.iloc[[]]  # nothing to sample; avoids indexing row 0 of an empty frame
+    values = df["value"].to_numpy()  # positional access without a per-row .iloc lookup
+    indices = [0]
+    for i in range(1, len(values)):
+        if abs(values[i] - values[indices[-1]]) > threshold_dT:
+            indices.append(i)
+    return df.iloc[indices]
+        
+def sample_reglin(df,max_dT=0.5, max_poll_interval=2 * 3600):
+    """Resample df, waiting between polls for as long as the latest slope needs to drift by max_dT.
+
+    The slope between the two latest polls gives a wait of max_dT / |slope|
+    seconds, capped at max_poll_interval; the next poll is the first row (in
+    row order) whose time is strictly after that wait. The first two rows only
+    seed the slope and are not returned. Fewer than two rows yield an empty frame.
+    """
+    if len(df) < 2:
+        return df.iloc[[]]  # no slope can be computed
+    indices = []
+    t0, t1 = df["time"].iloc[0], df["time"].iloc[1]
+    while True:
+        v0 = df[df["time"] == t0]["value"].values[0]
+        v1 = df[df["time"] == t1]["value"].values[0]
+        s = abs((v1 - v0) / (t1 - t0).total_seconds())
+        # A zero (or NaN) slope cannot be inverted: fall back to the maximum wait
+        wait = min(max_dT / s, max_poll_interval) if s > 0 else max_poll_interval
+        later = df[df["time"] > t1 + datetime.timedelta(seconds=wait)]
+        if later.empty:
+            break  # no row after the predicted poll time: sampling is done
+        indices.append(later.index[0])
+        t0, t1 = t1, later["time"].iloc[0]
+    return df.loc[indices]
+
+def sample_avg_rate_of_change(df,poll_rate):
+    """Keep row 0, then each row later than the last kept one by more than poll_rate.iloc[hour of that row] seconds (positional lookup)."""
+    times, indices = df["time"], [0]
+    for i in range(len(df)):
+        if times.iloc[i] - times.iloc[indices[-1]] > datetime.timedelta(seconds=poll_rate.iloc[times.iloc[i].hour]):
+            indices.append(i)
+    return df.iloc[indices]