From a9a9777aefd7a9f0f3c2e316a522d2cfe04d51e5 Mon Sep 17 00:00:00 2001 From: Quentin Roussel Date: Mon, 6 May 2024 21:48:50 +0200 Subject: [PATCH] finished compare plot --- analyze.py | 57 +++++++++++++-- main.py | 203 +++++++++++++++++++++++++++++++++++++++++++++++++---- poll.py | 49 ++++++++++++- 3 files changed, 289 insertions(+), 20 deletions(-) diff --git a/analyze.py b/analyze.py index 6be15b7..31cb08d 100644 --- a/analyze.py +++ b/analyze.py @@ -1,18 +1,26 @@ import pandas as pd import matplotlib.pyplot as plt -def distribution_of_differences(df, column_name): +def error(df, df_original, column_name): + diff = [] # Check if the column exists in the DataFrame if column_name not in df.columns: raise ValueError(f"The column '{column_name}' does not exist in the DataFrame.") + + def last_value_before(timestamp): + if df[df['time'] <= timestamp].empty: + raise ValueError("No point before the date") + return df[df['time'] <= timestamp].iloc[-1] - # Calculate differences between consecutive rows for the specified column - differences = df[column_name].diff().abs() + for i in range(1, len(df_original)): + try: + diff.append(abs(df_original["value"].iloc[i] - last_value_before(df_original["time"].iloc[i])["value"])) + except ValueError: + continue + + return diff - # The first element of differences will be NaN since there's no previous element for the first row - differences = differences.dropna() # Remove NaN values - return differences def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"): plt.figure(figsize=(8, 4)) # Set the figure size for better readability @@ -22,3 +30,40 @@ def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differe plt.ylabel('Frequency') plt.grid(True) plt.show() + +def compute_efficiency(df): + #compute the time differnece between the first and last point + time_diff = df["time"].iloc[-1] - df["time"].iloc[0] + #compute the number of points + num_points = len(df) + #compute the efficiency + efficiency = time_diff.total_seconds() / num_points + return efficiency + +def hourly_rate_of_change(df): + # Check if required columns exist + if 'time' not in df.columns or 'value' not in df.columns: + raise ValueError("DataFrame must include 'time' and 'value' columns.") + + # Check if the DataFrame is empty + if df.empty: + raise ValueError("The DataFrame is empty.") + + # Ensure 'time' is of datetime type + if not pd.api.types.is_datetime64_any_dtype(df['time']): + raise ValueError("'time' column must be of datetime type.") + + # Calculate the difference between consecutive entries + df['time_diff'] = df['time'].diff().dt.total_seconds() / 3600 # Convert time difference to hours + df['value_diff'] = df['value'].diff() + + # Calculate the rate of change in degrees per hour, and take the absolute value + df['rate_of_change'] = (df['value_diff'] / df['time_diff']).abs() + + # Extract the hour from each datetime + df['hour'] = df['time'].dt.hour + + # Group by hour and calculate the average absolute rate of change for each hour + hourly_avg_abs_rate = df.groupby('hour')['rate_of_change'].mean() + + return hourly_avg_abs_rate diff --git a/main.py b/main.py index ee8ce97..f452c78 100644 --- a/main.py +++ b/main.py @@ -4,15 +4,18 @@ from generate_data import * from analyze import * from poll import * +# sort two lists based on the first list +def sort(X,Y): + return zip(*sorted(zip(X,Y))) + def plot_temperature_data(df, recent_count=None): - plt.figure(figsize=(10, 5)) + plt.figure(figsize=(5, 5)) # Check if recent_count is specified and valid if recent_count is not None and recent_count > 0: df = df.tail(recent_count) # Slice the DataFrame to get the last 'recent_count' rows - plt.plot(df['time'], df['value'], label='Temperature', color='tab:red') - plt.title('Temperature Over Time') + plt.plot(df['time'], df['value'], label='Temperature', color='tab:red', marker='x') plt.xlabel('Time') plt.ylabel('Temperature (°C)') plt.grid(True) @@ -21,16 +24,190 @@ def plot_temperature_data(df, recent_count=None): plt.tight_layout() # Adjusts subplot params so that the subplot(s) fits in to the figure area. plt.show() -# Load the data from the CSV file -df = generate_greenhouse_data("datasets/greenhouse.csv") -plot_temperature_data(df) -df2 = sample_every_kth_point(df,50) +def test_sample_every_kth_point(df): + X = np.arange(1, 10, 1) + MEAN = [] + STD = [] + MEDIAN = [] + EFFICIENCY = [] + for x in X: + print(x) + df_sampled = sample_every_kth_point(df, int(x)) + # plot_temperature_data(df) -diff1 = distribution_of_differences(df, 'value') -diff2 = distribution_of_differences(df2, 'value') + diff = error(df_sampled, df, 'value') -diff1 = diff1[diff1 <= 10] -diff2 = diff2[diff2 <= 10] + MEAN.append(np.mean(diff)) + STD.append(np.std(diff)) + MEDIAN.append(np.median(diff)) + EFFICIENCY.append(compute_efficiency(df_sampled)) + + return X, EFFICIENCY, MEAN, MEDIAN, STD -plot_histogram(diff1,bins=20, title='Distribution of Absolute Differences (Original Data)') -plot_histogram(diff2, bins=20, title='Distribution of Absolute Differences (Sampled Data)') + +def example_sample_every_kth_point(k=10): + df = generate_greenhouse_data("datasets/greenhouse.csv") + df = df.tail(150) + df = sample_every_kth_point(df, k) + plot_temperature_data(df) + +def example_sample_reglin(): + df = generate_greenhouse_data("datasets/greenhouse.csv") + df = df.tail(150) + df = sample_reglin(df) + plot_temperature_data(df) + +def exaample_optimal_sample(dT = 0.3): + df = generate_greenhouse_data("datasets/greenhouse.csv") + df = df.tail(150) + df = optimal_sample(df, threshold_dT=dT) + plot_temperature_data(df) + +def example_sample_avg_rate_of_change(): + df = generate_greenhouse_data("datasets/greenhouse.csv") + hroc = hourly_rate_of_change(df) + df = df.tail(150) + df = sample_avg_rate_of_change(df, 3600 * 1 / hroc) + plot_temperature_data(df) + +def test_sample_reglin(df): + X = np.arange(0.4, 3, 0.05) + MEAN = [] + STD = [] + MEDIAN = [] + EFFICIENCY = [] + for x in X: + print(x) + df_sampled = sample_reglin(df, max_dT=x) + # plot_temperature_data(df) + + diff = error(df_sampled, df, 'value') + + MEAN.append(np.mean(diff)) + STD.append(np.std(diff)) + MEDIAN.append(np.median(diff)) + EFFICIENCY.append(compute_efficiency(df_sampled)) + return X, EFFICIENCY, MEAN, MEDIAN, STD + + +def test_optimal_sample(df): + X = np.arange(0.1, 3, 0.05) + MEAN = [] + STD = [] + MEDIAN = [] + EFFICIENCY = [] + for x in X: + print(x) + df_sampeld= optimal_sample(df, threshold_dT=x) + # plot_temperature_data(df) + + diff = error(df_sampeld,df, 'value') + + MEAN.append(np.mean(diff)) + STD.append(np.std(diff)) + MEDIAN.append(np.median(diff)) + EFFICIENCY.append(compute_efficiency(df_sampeld)) + return X, EFFICIENCY, MEAN, MEDIAN, STD + +def test_sample_avg_rate_of_change(df,hourly_rate_of_change): + X = np.arange(0.01, 3, 0.05) + MEAN = [] + STD = [] + MEDIAN = [] + EFFICIENCY = [] + for x in X: + print(x) + df_sampled = sample_avg_rate_of_change(df, 3600 * x / hourly_rate_of_change) + # plot_temperature_data(df) + + diff = error(df_sampled, df, 'value') + + MEAN.append(np.mean(diff)) + STD.append(np.std(diff)) + MEDIAN.append(np.median(diff)) + EFFICIENCY.append(compute_efficiency(df_sampled)) + return X, EFFICIENCY, MEAN, MEDIAN, STD + +def comparaison_mean(df,limit=1000): + plt.figure(figsize=(10, 5)) + hroc = hourly_rate_of_change(df) + df = df.tail(limit) + X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_every_kth_point(df) + MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY) + plt.plot( MEAN,EFFICIENCY, label="Constant Polling Interval", marker='x') + + X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_reglin(df) + MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY) + plt.plot( MEAN,EFFICIENCY, label="Linear Regression", marker='x') + + X, EFFICIENCY, MEAN, MEDIAN, STD = test_optimal_sample(df) + MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY) + plt.plot( MEAN,EFFICIENCY, label="Optimal Polling rate", marker='x') + + X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_avg_rate_of_change(df,hroc) + MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY) + plt.plot( MEAN,EFFICIENCY, label="Hourly Rate of Change", marker='x') + + plt.ylabel("Average seconds between polls") + plt.xlabel("Average error") + plt.ylim(0, 8000) + plt.xlim(0,1.3) + + plt.legend() + plt.show() + +def example_optimal_sample(dT = 0.3): + df = generate_greenhouse_data("datasets/greenhouse.csv") + df = df.tail(1000) + df = optimal_sample(df, threshold_dT=dT) + plt.plot(df['time'], df['value'], label='Temperature', color='tab:red', marker='x') + plt.title('Temperature Over Time') + plt.xlabel('Time') + plt.ylabel('Temperature (°C)') + plt.grid(True) + plt.legend() + plt.show() + +def histogram_sample_every_kth_point(k=10): + df = generate_greenhouse_data("datasets/greenhouse.csv") + df = df.tail(1000) + df_sampled = sample_every_kth_point(df, k) + diff = error(df, df_sampled, 'value') + plot_histogram(diff) + +# histogram_sample_every_kth_point(1) +# df = generate_greenhouse_data("datasets/greenhouse.csv") +# df = df.tail(1000) + +#Comparaison of the mean error with simplex +# df = generate_simplex(interval=600, frequency=10) +# plt.plot(df['time'], df['value'], label='Temperature', color='tab:red', marker='x') +# plt.show() +# comparaison_mean(df) +#Same thing with the greenhouse data +# df = generate_greenhouse_data("datasets/greenhouse.csv") +# df = df.tail(1000) +# plt.plot(df['time'], df['value'], label='Temperature', color='tab:red', marker='x') +# plt.show() +# comparaison_mean(df) + +# Temperature rate of change over the day +# df = generate_greenhouse_data("datasets/greenhouse.csv") +# hcor = hourly_rate_of_change(df) +# hcor.plot() +# plt.xlabel("Hour of the day") +# plt.ylabel("Average absolute rate of change (°C/hour)") +# plt.show() +# plt.ylabel("Aboslute rate of change of the temperature (°C/hour)") +# plt.xlabel("Hour of the day") +# plt.show() + +df = generate_greenhouse_data("datasets/greenhouse.csv") +comparaison_mean(df, 1000) + +# example_sample_every_kth_point(1) +# example_sample_every_kth_point(10) +# exaample_optimal_sample() +# example_sample_reglin() +# example_sample_avg_rate_of_change() + # Calculate differences between consecutive rows for the specified column \ No newline at end of file diff --git a/poll.py b/poll.py index 13262e8..7dd61ff 100644 --- a/poll.py +++ b/poll.py @@ -1,3 +1,6 @@ +import datetime +from analyze import hourly_rate_of_change + def sample_every_kth_point(df, k): # Validate the input to ensure k is positive and does not exceed the DataFrame length if k <= 0: @@ -7,4 +10,48 @@ def sample_every_kth_point(df, k): # Sample every k-th point sampled_df = df.iloc[::k] - return sampled_df \ No newline at end of file + return sampled_df + +def optimal_sample(df, threshold_dT=0.5): + t0 = df["time"].iloc[0] + indices = [0] + times = [t0] + for i in range(1, len(df)): + dT = abs(df["value"].iloc[i] - df["value"].iloc[indices[-1]]) + if dT > threshold_dT: + times.append(i) + indices.append(i) + return df.iloc[indices] + +def sample_reglin(df,max_dT=0.5, max_poll_interval=2 * 3600): + indices = [] + def get_first_point_after(date): + if(df[df['time'] > date].empty): + raise ValueError("No point before the date") + return df[df['time'] > date].iloc[0] + # Get first two points + t0 = df["time"].iloc[0] + t1 = df["time"].iloc[1] + while True: + v0 = df[df["time"] == t0]["value"].values[0] + v1 = df[df["time"] == t1]["value"].values[0] + # Calculate the slope + s = abs((v1 - v0) / (t1 - t0).total_seconds()) + #add max_dT/s to t1 + new_t = t1 + datetime.timedelta(seconds=min(max_dT/s, max_poll_interval)) + try: + new_t = get_first_point_after(new_t)["time"] + indices.append(df[df["time"] == new_t].index[0]) + t0 = t1 + t1 = new_t + except ValueError: + break + return df.loc[indices] + +def sample_avg_rate_of_change(df,poll_rate): + indices = [0] + for i in range(len(df)): + current_hour = df["time"].iloc[i].hour + if(df["time"].iloc[i] - df["time"].iloc[indices[-1]] > datetime.timedelta(seconds = poll_rate.iloc[current_hour])): + indices.append(i) + return df.iloc[indices]