mirror of
https://git.roussel.pro/telecom-paris/GIN206.git
synced 2026-02-09 10:40:17 +01:00
finished compare plot
This commit is contained in:
57
analyze.py
57
analyze.py
@@ -1,18 +1,26 @@
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import matplotlib.pyplot as plt
|
import matplotlib.pyplot as plt
|
||||||
|
|
||||||
def distribution_of_differences(df, column_name):
|
def error(df, df_original, column_name):
|
||||||
|
diff = []
|
||||||
# Check if the column exists in the DataFrame
|
# Check if the column exists in the DataFrame
|
||||||
if column_name not in df.columns:
|
if column_name not in df.columns:
|
||||||
raise ValueError(f"The column '{column_name}' does not exist in the DataFrame.")
|
raise ValueError(f"The column '{column_name}' does not exist in the DataFrame.")
|
||||||
|
|
||||||
# Calculate differences between consecutive rows for the specified column
|
def last_value_before(timestamp):
|
||||||
differences = df[column_name].diff().abs()
|
if df[df['time'] <= timestamp].empty:
|
||||||
|
raise ValueError("No point before the date")
|
||||||
|
return df[df['time'] <= timestamp].iloc[-1]
|
||||||
|
|
||||||
|
for i in range(1, len(df_original)):
|
||||||
|
try:
|
||||||
|
diff.append(abs(df_original["value"].iloc[i] - last_value_before(df_original["time"].iloc[i])["value"]))
|
||||||
|
except ValueError:
|
||||||
|
continue
|
||||||
|
|
||||||
|
return diff
|
||||||
|
|
||||||
# The first element of differences will be NaN since there's no previous element for the first row
|
|
||||||
differences = differences.dropna() # Remove NaN values
|
|
||||||
|
|
||||||
return differences
|
|
||||||
|
|
||||||
def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
|
def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
|
||||||
plt.figure(figsize=(8, 4)) # Set the figure size for better readability
|
plt.figure(figsize=(8, 4)) # Set the figure size for better readability
|
||||||
@@ -22,3 +30,40 @@ def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differe
|
|||||||
plt.ylabel('Frequency')
|
plt.ylabel('Frequency')
|
||||||
plt.grid(True)
|
plt.grid(True)
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
|
def compute_efficiency(df):
    """Return the recorded time span divided by the number of samples.

    The result is the number of seconds elapsed between the first and the
    last row of ``df['time']`` divided by ``len(df)``.

    Args:
        df: DataFrame with a ``time`` column whose first/last difference
            exposes ``total_seconds()`` (e.g. pandas datetimes).

    Returns:
        Seconds of recording per sampled point, as a float.

    Raises:
        ValueError: If ``df`` has no rows.
    """
    num_points = len(df)
    if num_points == 0:
        # Fail with a clear message instead of an out-of-bounds IndexError.
        raise ValueError("Cannot compute efficiency of an empty DataFrame.")

    # Time difference between the first and the last point.
    time_diff = df["time"].iloc[-1] - df["time"].iloc[0]
    return time_diff.total_seconds() / num_points
|
||||||
|
|
||||||
|
def hourly_rate_of_change(df):
    """Average absolute rate of change of ``value`` for each hour of the day.

    For every pair of consecutive rows the absolute change of ``value`` per
    hour is computed, then those rates are averaged by the hour-of-day of the
    later row. The input frame is left untouched: no helper columns are
    added to it.

    Args:
        df: DataFrame with a datetime ``time`` column and a numeric
            ``value`` column.

    Returns:
        A Series named ``rate_of_change`` indexed by ``hour``. Only hours
        that occur in ``df`` appear in the index.

    Raises:
        ValueError: If a required column is missing, the frame is empty, or
            ``time`` is not of datetime type.
    """
    # Check if required columns exist
    if 'time' not in df.columns or 'value' not in df.columns:
        raise ValueError("DataFrame must include 'time' and 'value' columns.")

    # Check if the DataFrame is empty
    if df.empty:
        raise ValueError("The DataFrame is empty.")

    # Ensure 'time' is of datetime type
    if not pd.api.types.is_datetime64_any_dtype(df['time']):
        raise ValueError("'time' column must be of datetime type.")

    # Elapsed time between consecutive entries, converted to hours.
    elapsed_hours = df['time'].diff().dt.total_seconds() / 3600
    # A zero-length interval has no defined rate; mask it so it becomes NaN
    # (skipped by mean) instead of an infinite rate that would swamp the
    # hourly average.
    elapsed_hours = elapsed_hours.where(elapsed_hours != 0)

    # Absolute rate of change per hour, kept in a scratch frame so the
    # caller's DataFrame is not modified.
    rates = pd.DataFrame({
        'hour': df['time'].dt.hour,
        'rate_of_change': (df['value'].diff() / elapsed_hours).abs(),
    })

    # Group by hour and average the absolute rate of change for each hour.
    return rates.groupby('hour')['rate_of_change'].mean()
|
||||||
|
|||||||
205
main.py
205
main.py
@@ -4,15 +4,18 @@ from generate_data import *
|
|||||||
from analyze import *
|
from analyze import *
|
||||||
from poll import *
|
from poll import *
|
||||||
|
|
||||||
|
# Sort two sequences together, ordered by the values of the first one
def sort(X, Y):
    """Sort ``X`` and ``Y`` in lockstep by the (x, y) pairs.

    Args:
        X: Sequence providing the primary sort key.
        Y: Sequence of values paired element-wise with ``X``.

    Returns:
        A pair ``(sorted_X, sorted_Y)`` of tuples. For empty inputs two empty
        tuples are returned so callers can still unpack two values.
    """
    pairs = sorted(zip(X, Y))
    if not pairs:
        # zip(*[]) yields nothing, which would break "a, b = sort(...)".
        return (), ()
    return tuple(zip(*pairs))
|
||||||
|
|
||||||
def plot_temperature_data(df, recent_count=None):
|
def plot_temperature_data(df, recent_count=None):
|
||||||
plt.figure(figsize=(10, 5))
|
plt.figure(figsize=(5, 5))
|
||||||
|
|
||||||
# Check if recent_count is specified and valid
|
# Check if recent_count is specified and valid
|
||||||
if recent_count is not None and recent_count > 0:
|
if recent_count is not None and recent_count > 0:
|
||||||
df = df.tail(recent_count) # Slice the DataFrame to get the last 'recent_count' rows
|
df = df.tail(recent_count) # Slice the DataFrame to get the last 'recent_count' rows
|
||||||
|
|
||||||
plt.plot(df['time'], df['value'], label='Temperature', color='tab:red')
|
plt.plot(df['time'], df['value'], label='Temperature', color='tab:red', marker='x')
|
||||||
plt.title('Temperature Over Time')
|
|
||||||
plt.xlabel('Time')
|
plt.xlabel('Time')
|
||||||
plt.ylabel('Temperature (°C)')
|
plt.ylabel('Temperature (°C)')
|
||||||
plt.grid(True)
|
plt.grid(True)
|
||||||
@@ -21,16 +24,190 @@ def plot_temperature_data(df, recent_count=None):
|
|||||||
plt.tight_layout() # Adjusts subplot params so that the subplot(s) fits in to the figure area.
|
plt.tight_layout() # Adjusts subplot params so that the subplot(s) fits in to the figure area.
|
||||||
plt.show()
|
plt.show()
|
||||||
|
|
||||||
# Load the data from the CSV file
|
def test_sample_every_kth_point(df):
    """Evaluate fixed-step sampling for every step k from 1 to 9.

    For each k the frame is sampled with ``sample_every_kth_point``; the
    values returned by ``error`` are summarised (mean, std, median) and the
    efficiency of the sampled frame is recorded.

    Returns:
        ``(X, EFFICIENCY, MEAN, MEDIAN, STD)`` where ``X`` is the array of
        tested steps and the other items are lists aligned with it.
    """
    X = np.arange(1, 10, 1)
    MEAN, STD, MEDIAN, EFFICIENCY = [], [], [], []

    for step in X:
        print(step)
        sampled = sample_every_kth_point(df, int(step))
        deviations = error(sampled, df, 'value')

        MEAN.append(np.mean(deviations))
        STD.append(np.std(deviations))
        MEDIAN.append(np.median(deviations))
        EFFICIENCY.append(compute_efficiency(sampled))

    return X, EFFICIENCY, MEAN, MEDIAN, STD
|
||||||
|
|
||||||
|
|
||||||
|
def example_sample_every_kth_point(k=10):
    """Plot the last 150 greenhouse rows after keeping every k-th one."""
    recent = generate_greenhouse_data("datasets/greenhouse.csv").tail(150)
    plot_temperature_data(sample_every_kth_point(recent, k))
|
||||||
|
|
||||||
|
def example_sample_reglin():
    """Plot the last 150 greenhouse rows after sampling with sample_reglin."""
    recent = generate_greenhouse_data("datasets/greenhouse.csv").tail(150)
    plot_temperature_data(sample_reglin(recent))
|
||||||
|
|
||||||
|
def exaample_optimal_sample(dT = 0.3):
    """Plot the last 150 greenhouse rows after sampling with optimal_sample.

    ``dT`` is forwarded as the ``threshold_dT`` argument of optimal_sample.
    """
    recent = generate_greenhouse_data("datasets/greenhouse.csv").tail(150)
    plot_temperature_data(optimal_sample(recent, threshold_dT=dT))
|
||||||
|
|
||||||
|
def example_sample_avg_rate_of_change():
    """Plot the last 150 greenhouse rows sampled by hourly rate of change.

    The hourly rate of change is computed on the full dataset before the
    frame is cut down to its last 150 rows.
    """
    full = generate_greenhouse_data("datasets/greenhouse.csv")
    hroc = hourly_rate_of_change(full)
    recent = full.tail(150)
    # Per-hour polling interval passed on to the sampler.
    poll_rate = 3600 * 1 / hroc
    plot_temperature_data(sample_avg_rate_of_change(recent, poll_rate))
|
||||||
|
|
||||||
|
def test_sample_reglin(df):
    """Evaluate sample_reglin for max_dT from 0.4 up to (excluding) 3.

    For each max_dT (step 0.05) the frame is sampled; the values returned by
    ``error`` are summarised (mean, std, median) and the efficiency of the
    sampled frame is recorded.

    Returns:
        ``(X, EFFICIENCY, MEAN, MEDIAN, STD)`` where ``X`` is the array of
        tested thresholds and the other items are lists aligned with it.
    """
    X = np.arange(0.4, 3, 0.05)
    MEAN, STD, MEDIAN, EFFICIENCY = [], [], [], []

    for max_delta in X:
        print(max_delta)
        sampled = sample_reglin(df, max_dT=max_delta)
        deviations = error(sampled, df, 'value')

        MEAN.append(np.mean(deviations))
        STD.append(np.std(deviations))
        MEDIAN.append(np.median(deviations))
        EFFICIENCY.append(compute_efficiency(sampled))

    return X, EFFICIENCY, MEAN, MEDIAN, STD
|
||||||
|
|
||||||
|
|
||||||
|
def test_optimal_sample(df):
    """Evaluate optimal_sample for threshold_dT from 0.1 up to (excluding) 3.

    For each threshold (step 0.05) the frame is sampled; the values returned
    by ``error`` are summarised (mean, std, median) and the efficiency of the
    sampled frame is recorded.

    Returns:
        ``(X, EFFICIENCY, MEAN, MEDIAN, STD)`` where ``X`` is the array of
        tested thresholds and the other items are lists aligned with it.
    """
    X = np.arange(0.1, 3, 0.05)
    MEAN, STD, MEDIAN, EFFICIENCY = [], [], [], []

    for threshold in X:
        print(threshold)
        sampled = optimal_sample(df, threshold_dT=threshold)
        deviations = error(sampled, df, 'value')

        MEAN.append(np.mean(deviations))
        STD.append(np.std(deviations))
        MEDIAN.append(np.median(deviations))
        EFFICIENCY.append(compute_efficiency(sampled))

    return X, EFFICIENCY, MEAN, MEDIAN, STD
|
||||||
|
|
||||||
|
def test_sample_avg_rate_of_change(df,hourly_rate_of_change):
    """Evaluate sample_avg_rate_of_change for factors 0.01 up to (excl.) 3.

    For each factor x (step 0.05) the polling intervals passed to the sampler
    are ``3600 * x / hourly_rate_of_change``; the values returned by
    ``error`` are summarised (mean, std, median) and the efficiency of the
    sampled frame is recorded.

    Note: inside this function the ``hourly_rate_of_change`` parameter hides
    the function of the same name.

    Returns:
        ``(X, EFFICIENCY, MEAN, MEDIAN, STD)`` where ``X`` is the array of
        tested factors and the other items are lists aligned with it.
    """
    X = np.arange(0.01, 3, 0.05)
    MEAN, STD, MEDIAN, EFFICIENCY = [], [], [], []

    for factor in X:
        print(factor)
        poll_rate = 3600 * factor / hourly_rate_of_change
        sampled = sample_avg_rate_of_change(df, poll_rate)
        deviations = error(sampled, df, 'value')

        MEAN.append(np.mean(deviations))
        STD.append(np.std(deviations))
        MEDIAN.append(np.median(deviations))
        EFFICIENCY.append(compute_efficiency(sampled))

    return X, EFFICIENCY, MEAN, MEDIAN, STD
|
||||||
|
|
||||||
|
def comparaison_mean(df,limit=1000):
    """Plot efficiency against mean error for the four sampling strategies.

    The hourly rate of change is computed on the full frame, then only the
    last ``limit`` rows are used for the four evaluations. Each strategy is
    drawn as one curve, with its points ordered by mean error.
    """
    plt.figure(figsize=(10, 5))
    hroc = hourly_rate_of_change(df)
    recent = df.tail(limit)

    # (legend label, evaluation to run), in drawing order.
    strategies = [
        ("Constant Polling Interval", lambda: test_sample_every_kth_point(recent)),
        ("Linear Regression", lambda: test_sample_reglin(recent)),
        ("Optimal Polling rate", lambda: test_optimal_sample(recent)),
        ("Hourly Rate of Change", lambda: test_sample_avg_rate_of_change(recent, hroc)),
    ]
    for label, evaluate in strategies:
        _, efficiency, mean_error, _, _ = evaluate()
        mean_error, efficiency = sort(mean_error, efficiency)
        plt.plot(mean_error, efficiency, label=label, marker='x')

    plt.ylabel("Average seconds between polls")
    plt.xlabel("Average error")
    plt.ylim(0, 8000)
    plt.xlim(0,1.3)

    plt.legend()
    plt.show()
|
||||||
|
|
||||||
|
def example_optimal_sample(dT = 0.3):
    """Plot the last 1000 greenhouse rows after sampling with optimal_sample.

    ``dT`` is forwarded as the ``threshold_dT`` argument of optimal_sample.
    """
    recent = generate_greenhouse_data("datasets/greenhouse.csv").tail(1000)
    sampled = optimal_sample(recent, threshold_dT=dT)

    plt.plot(
        sampled['time'],
        sampled['value'],
        label='Temperature',
        color='tab:red',
        marker='x',
    )
    plt.title('Temperature Over Time')
    plt.xlabel('Time')
    plt.ylabel('Temperature (°C)')
    plt.grid(True)
    plt.legend()
    plt.show()
|
||||||
|
|
||||||
|
def histogram_sample_every_kth_point(k=10):
    """Plot a histogram of the error made by keeping every k-th point.

    Uses the last 1000 rows of the greenhouse dataset as the reference.
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(1000)
    df_sampled = sample_every_kth_point(df, k)
    # error() takes the sampled frame first and the reference frame second;
    # passing them the other way round compares each sampled point against
    # itself and yields only zeros.
    diff = error(df_sampled, df, 'value')
    plot_histogram(diff)
|
||||||
|
|
||||||
|
# histogram_sample_every_kth_point(1)
|
||||||
|
# df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
|
# df = df.tail(1000)
|
||||||
|
|
||||||
|
# Comparison of the mean error with simplex
|
||||||
|
# df = generate_simplex(interval=600, frequency=10)
|
||||||
|
# plt.plot(df['time'], df['value'], label='Temperature', color='tab:red', marker='x')
|
||||||
|
# plt.show()
|
||||||
|
# comparaison_mean(df)
|
||||||
|
#Same thing with the greenhouse data
|
||||||
|
# df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
|
# df = df.tail(1000)
|
||||||
|
# plt.plot(df['time'], df['value'], label='Temperature', color='tab:red', marker='x')
|
||||||
|
# plt.show()
|
||||||
|
# comparaison_mean(df)
|
||||||
|
|
||||||
|
# Temperature rate of change over the day
|
||||||
|
# df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
|
# hcor = hourly_rate_of_change(df)
|
||||||
|
# hcor.plot()
|
||||||
|
# plt.xlabel("Hour of the day")
|
||||||
|
# plt.ylabel("Average absolute rate of change (°C/hour)")
|
||||||
|
# plt.show()
|
||||||
|
# plt.ylabel("Aboslute rate of change of the temperature (°C/hour)")
|
||||||
|
# plt.xlabel("Hour of the day")
|
||||||
|
# plt.show()
|
||||||
|
|
||||||
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
df = generate_greenhouse_data("datasets/greenhouse.csv")
|
||||||
plot_temperature_data(df)
|
comparaison_mean(df, 1000)
|
||||||
df2 = sample_every_kth_point(df,50)
|
|
||||||
|
|
||||||
diff1 = distribution_of_differences(df, 'value')
|
# example_sample_every_kth_point(1)
|
||||||
diff2 = distribution_of_differences(df2, 'value')
|
# example_sample_every_kth_point(10)
|
||||||
|
# exaample_optimal_sample()
|
||||||
diff1 = diff1[diff1 <= 10]
|
# example_sample_reglin()
|
||||||
diff2 = diff2[diff2 <= 10]
|
# example_sample_avg_rate_of_change()
|
||||||
|
# Calculate differences between consecutive rows for the specified column
|
||||||
plot_histogram(diff1,bins=20, title='Distribution of Absolute Differences (Original Data)')
|
|
||||||
plot_histogram(diff2, bins=20, title='Distribution of Absolute Differences (Sampled Data)')
|
|
||||||
47
poll.py
47
poll.py
@@ -1,3 +1,6 @@
|
|||||||
|
import datetime
|
||||||
|
from analyze import hourly_rate_of_change
|
||||||
|
|
||||||
def sample_every_kth_point(df, k):
|
def sample_every_kth_point(df, k):
|
||||||
# Validate the input to ensure k is positive and does not exceed the DataFrame length
|
# Validate the input to ensure k is positive and does not exceed the DataFrame length
|
||||||
if k <= 0:
|
if k <= 0:
|
||||||
@@ -8,3 +11,47 @@ def sample_every_kth_point(df, k):
|
|||||||
# Sample every k-th point
|
# Sample every k-th point
|
||||||
sampled_df = df.iloc[::k]
|
sampled_df = df.iloc[::k]
|
||||||
return sampled_df
|
return sampled_df
|
||||||
|
|
||||||
|
def optimal_sample(df, threshold_dT=0.5):
    """Keep a row each time ``value`` drifts beyond a threshold.

    The first row is always kept. Every later row is kept when the absolute
    difference between its ``value`` and the ``value`` of the most recently
    kept row is strictly greater than ``threshold_dT``.

    Args:
        df: DataFrame with a numeric ``value`` column.
        threshold_dT: Minimum absolute change that triggers keeping a row.

    Returns:
        The kept rows of ``df``, in their original order. An empty frame is
        returned unchanged in shape (no rows).
    """
    if len(df) == 0:
        return df.iloc[0:0]

    # Read the column once instead of doing a positional lookup per row.
    values = df["value"].to_numpy()
    indices = [0]
    reference = values[0]
    for i in range(1, len(values)):
        if abs(values[i] - reference) > threshold_dT:
            indices.append(i)
            reference = values[i]
    return df.iloc[indices]
|
||||||
|
|
||||||
|
def sample_reglin(df,max_dT=0.5, max_poll_interval=2 * 3600):
    """Sample ``df`` by extrapolating the slope of the last two polls.

    The first two rows seed the process. At each step the absolute slope
    between the two most recent polls is used to estimate how long it takes
    for ``value`` to change by ``max_dT``; the next poll is the first row
    strictly later than that delay after the latest poll, capped at
    ``max_poll_interval`` seconds. Sampling stops when no such row exists.

    The two seed rows themselves are not part of the result.

    Args:
        df: DataFrame with a datetime ``time`` column and a numeric ``value``
            column. Rows are presumably ordered by time (TODO confirm with
            the data loader).
        max_dT: Change of ``value`` tolerated between two polls.
        max_poll_interval: Upper bound, in seconds, on the delay between two
            polls.

    Returns:
        The polled rows selected from ``df`` by index label. Empty when
        ``df`` has fewer than two rows or no row lies after the first delay.
    """
    # Two polls are needed before any slope can be computed.
    if len(df) < 2:
        return df.iloc[0:0]

    times = df["time"]
    indices = []

    # Get first two points
    t0, t1 = times.iloc[0], times.iloc[1]
    v0, v1 = df["value"].iloc[0], df["value"].iloc[1]

    while True:
        elapsed = (t1 - t0).total_seconds()
        slope = abs((v1 - v0) / elapsed) if elapsed else 0.0

        # A flat (or undefined) slope predicts no change at all, so wait for
        # the longest allowed interval instead of dividing by zero.
        if slope > 0:
            delay = min(max_dT / slope, max_poll_interval)
        else:
            delay = max_poll_interval
        deadline = t1 + datetime.timedelta(seconds=float(delay))

        upcoming = df[times > deadline]
        if upcoming.empty:
            # No point after the deadline: the data is exhausted.
            break

        indices.append(upcoming.index[0])
        t0, v0 = t1, v1
        t1, v1 = upcoming["time"].iloc[0], upcoming["value"].iloc[0]

    return df.loc[indices]
|
||||||
|
|
||||||
|
def sample_avg_rate_of_change(df,poll_rate):
    """Sample ``df`` using a polling interval that depends on the hour.

    The first row is always kept. A later row is kept when the time elapsed
    since the most recently kept row is strictly greater than the interval
    configured for that row's hour of the day.

    Args:
        df: DataFrame with a datetime ``time`` column.
        poll_rate: Series of polling intervals in seconds, indexed by hour of
            the day. It is looked up by label, so it stays correct when some
            hours are absent from the Series.

    Returns:
        The kept rows of ``df``, in their original order; an empty frame for
        an empty input.

    Raises:
        KeyError: If ``poll_rate`` has no entry for the hour of a row.
    """
    if len(df) == 0:
        return df.iloc[0:0]

    times = df["time"]
    indices = [0]
    for i in range(1, len(df)):
        current_time = times.iloc[i]
        # Label-based lookup: the Series index holds the hour itself.
        interval = datetime.timedelta(
            seconds=float(poll_rate.loc[current_time.hour])
        )
        if current_time - times.iloc[indices[-1]] > interval:
            indices.append(i)
    return df.iloc[indices]
|
||||||
|
|||||||
Reference in New Issue
Block a user