added documentation

Quentin Roussel
2024-05-07 00:36:41 +02:00
parent ac4d95bd07
commit 3fd37213e1
4 changed files with 322 additions and 24 deletions

analyze.py

@@ -2,6 +2,21 @@ import pandas as pd
import matplotlib.pyplot as plt

def error(df, df_original, column_name):
    """
    Calculate the error between the values in a column of a DataFrame and the last value before each timestamp.

    Args:
        df (pandas.DataFrame): The DataFrame containing the values.
        df_original (pandas.DataFrame): The original DataFrame containing the timestamps and values.
        column_name (str): The name of the column to calculate the error for.

    Returns:
        list: A list of absolute differences between the values in the specified column and the last value before each timestamp.

    Raises:
        ValueError: If the specified column does not exist in the DataFrame.
    """
    diff = []
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
@@ -23,6 +38,19 @@ def error(df, df_original, column_name):
def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
    """
    Plots a histogram of the given data series.

    Parameters:
    - data_series (array-like): The data series to plot the histogram for.
    - bins (int): The number of bins to use for the histogram. Default is 10.
    - title (str): The title of the histogram plot. Default is "Distribution of Absolute Differences".

    Returns:
    None
    """
    import matplotlib.pyplot as plt
    plt.figure(figsize=(8, 4))  # Set the figure size for better readability
    plt.hist(data_series, bins=bins, color='blue', alpha=0.7, edgecolor='black')
    plt.title(title)
@@ -32,15 +60,40 @@ def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differe
    plt.show()

def compute_efficiency(df):
    """
    Compute the efficiency of a data frame, i.e. the average time in seconds between collected data points.

    Parameters:
    df (pandas.DataFrame): The input data frame.

    Returns:
    float: The efficiency value.
    """
    # compute the time difference between the first and last point
    time_diff = df["time"].iloc[-1] - df["time"].iloc[0]
    # compute the number of points
    num_points = len(df)
    # compute the efficiency
    efficiency = time_diff.total_seconds() / num_points
    return efficiency
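As a quick sanity check of the metric, here is a minimal sketch on a hypothetical synthetic frame (not one of the repository's datasets), calling compute_efficiency as defined above:

import pandas as pd

# Three readings spaced 10 minutes apart: a 1200 s span over 3 points gives 400 s per point.
times = pd.date_range("2024-05-01 00:00", periods=3, freq="10min")
df_demo = pd.DataFrame({"time": times, "value": [20.0, 20.5, 21.0]})
print(compute_efficiency(df_demo))  # 400.0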
def hourly_rate_of_change(df):
    """
    Calculate the average absolute rate of change per hour for a given DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data.

    Returns:
        pandas.Series: A Series containing the average absolute rate of change per hour.

    Raises:
        ValueError: If the DataFrame does not include 'time' and 'value' columns, or if it is empty.
        ValueError: If the 'time' column is not of datetime type.
    """
    # Check if required columns exist
    if 'time' not in df.columns or 'value' not in df.columns:
        raise ValueError("DataFrame must include 'time' and 'value' columns.")

generate_data.py

@@ -4,6 +4,19 @@ from opensimplex import OpenSimplex
import datetime

def generate_greenhouse_data(filepath):
    """
    Generate filtered greenhouse data from a CSV file.

    Parameters:
    filepath (str): The path to the CSV file.

    Returns:
    pandas.DataFrame: The filtered greenhouse data.
    """
    # Read the CSV file into a DataFrame, parsing 'time' as datetime
    df = pd.read_csv(filepath, parse_dates=["time"], dtype={"id": str, "value": float})
@@ -25,6 +38,21 @@ def generate_greenhouse_data(filepath):
def generate_simplex(start_time=None, end_time=None, interval=600, max_temp=30, min_temp=10, frequency=10):
    """
    Generate a DataFrame with time and temperature values using Simplex noise.

    Parameters:
    - start_time (datetime): The start time for generating the data. If not provided, it defaults to 1 day before the end_time.
    - end_time (datetime): The end time for generating the data. If not provided, it defaults to the current time.
    - interval (int): The time interval in seconds between each data point. Defaults to 600 seconds (10 minutes).
    - max_temp (float): The maximum temperature value. Defaults to 30.
    - min_temp (float): The minimum temperature value. Defaults to 10.
    - frequency (int): The frequency parameter for the Simplex noise generator. Defaults to 10.

    Returns:
    - df (DataFrame): A pandas DataFrame with 'time' and 'value' columns representing the generated time and temperature values.
    """
    # Default time settings if none provided
    if end_time is None:
        end_time = datetime.datetime.now()
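Based only on the documented signature above (the rest of the function body is outside this hunk), a usage sketch for generating two days of synthetic readings might look like:

import datetime

end = datetime.datetime.now()
start = end - datetime.timedelta(days=2)
df_synth = generate_simplex(start_time=start, end_time=end, interval=600,
                            max_temp=28, min_temp=12, frequency=10)
print(df_synth.head())  # expected columns: time, value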

main.py

@@ -4,11 +4,37 @@ from generate_data import *
from analyze import *
from poll import *

def sort(X, Y):
    """
    Sorts two lists X and Y in ascending order based on the values in X.

    Args:
        X (list): The first list to be sorted.
        Y (list): The second list to be sorted.

    Returns:
        tuple: Two tuples containing the sorted X and Y values (zip yields tuples, not lists).

    Example:
        X = [3, 1, 2]
        Y = ['c', 'a', 'b']
        sorted_X, sorted_Y = sort(X, Y)
        # sorted_X: (1, 2, 3)
        # sorted_Y: ('a', 'b', 'c')
    """
    return zip(*sorted(zip(X, Y)))
def plot_temperature_data(df, recent_count=None):
    """
    Plots the temperature data from a DataFrame.

    Args:
        df (pandas.DataFrame): The DataFrame containing the temperature data.
        recent_count (int, optional): The number of recent data points to plot. If specified, only the last 'recent_count' rows will be plotted. Defaults to None.

    Returns:
        None
    """
    plt.figure(figsize=(5, 5))
    # Check if recent_count is specified and valid
@@ -25,6 +51,20 @@ def plot_temperature_data(df, recent_count=None):
    plt.show()

def test_sample_every_kth_point(df):
    """
    Test the sample_every_kth_point function with different values of k.

    Parameters:
    - df: The input DataFrame containing the data.

    Returns:
    - X: The array of values used for sampling.
    - EFFICIENCY: The efficiency values for each sampling.
    - MEAN: The mean error values for each sampling.
    - MEDIAN: The median error values for each sampling.
    - STD: The standard deviation of error values for each sampling.
    """
    X = np.arange(1, 10, 1)
    MEAN = []
    STD = []
@@ -46,24 +86,59 @@ def test_sample_every_kth_point(df):
def example_sample_every_kth_point(k=10):
    """
    Example function that demonstrates how to sample every kth point from a dataframe and plot the temperature data.

    Parameters:
    k (int): The sampling interval. Default is 10.

    Returns:
    None
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(150)
    df = sample_every_kth_point(df, k)
    plot_temperature_data(df)

def example_sample_reglin():
    """
    This function demonstrates the usage of the sample_reglin function.
    It generates greenhouse data, selects the last 150 rows, applies the sample_reglin function,
    and plots the temperature data.
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(150)
    df = sample_reglin(df)
    plot_temperature_data(df)

def example_optimal_sample(dT=0.3):
    """
    Example function that demonstrates the usage of the optimal_sample function.

    Parameters:
    dT (float): The threshold value for temperature difference. Default is 0.3.

    Returns:
    None
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(150)
    df = optimal_sample(df, threshold_dT=dT)
    plot_temperature_data(df)
def example_sample_avg_rate_of_change():
    """
    This function demonstrates how to calculate the sample average rate of change for temperature data.
    It generates greenhouse data, calculates the hourly rate of change, selects the last 150 records,
    and then calculates the sample average rate of change based on the hourly rate of change.
    Finally, it plots the temperature data.

    Parameters:
    None

    Returns:
    None
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    hroc = hourly_rate_of_change(df)
    df = df.tail(150)
@@ -71,6 +146,25 @@ def example_sample_avg_rate_of_change():
    plot_temperature_data(df)

def test_sample_reglin(df):
    """
    Perform a test on the sample_reglin function with different values of max_dT.

    Parameters:
    - df: DataFrame
        The input DataFrame containing temperature data.

    Returns:
    - X: ndarray
        An array of values ranging from 0.4 to 3 with a step of 0.05.
    - EFFICIENCY: list
        A list of efficiency values calculated for each max_dT value.
    - MEAN: list
        A list of mean error values calculated for each max_dT value.
    - MEDIAN: list
        A list of median error values calculated for each max_dT value.
    - STD: list
        A list of standard deviation error values calculated for each max_dT value.
    """
    X = np.arange(0.4, 3, 0.05)
    MEAN = []
    STD = []
@@ -91,6 +185,20 @@ def test_sample_reglin(df):
def test_optimal_sample(df):
    """
    Test the optimal sample function with different threshold values.

    Args:
        df (pandas.DataFrame): The input DataFrame containing temperature data.

    Returns:
        tuple: A tuple containing the following lists:
        - X (numpy.ndarray): An array of threshold values.
        - EFFICIENCY (list): A list of efficiency values for each threshold.
        - MEAN (list): A list of mean error values for each threshold.
        - MEDIAN (list): A list of median error values for each threshold.
        - STD (list): A list of standard deviation error values for each threshold.
    """
    X = np.arange(0.1, 3, 0.05)
    MEAN = []
    STD = []
@@ -109,7 +217,24 @@ def test_optimal_sample(df):
        EFFICIENCY.append(compute_efficiency(df_sampeld))
    return X, EFFICIENCY, MEAN, MEDIAN, STD

def test_sample_avg_rate_of_change(df, hourly_rate_of_change):
    """
    Test the sample average rate of change.
    This function takes a DataFrame `df` and the `hourly_rate_of_change` as input.
    It performs a series of calculations on the data and returns the results.

    Parameters:
    - df (pandas.DataFrame): The input DataFrame containing the data.
    - hourly_rate_of_change (pandas.Series): The average absolute rate of change for each hour of the day.

    Returns:
    - X (numpy.ndarray): An array of values ranging from 0.01 to 3 with a step of 0.05.
    - EFFICIENCY (list): A list of efficiency values calculated for each sample.
    - MEAN (list): A list of mean values calculated for each sample.
    - MEDIAN (list): A list of median values calculated for each sample.
    - STD (list): A list of standard deviation values calculated for each sample.
    """
    X = np.arange(0.01, 3, 0.05)
    MEAN = []
    STD = []
@@ -128,35 +253,56 @@ def test_sample_avg_rate_of_change(df,hourly_rate_of_change):
        EFFICIENCY.append(compute_efficiency(df_sampled))
    return X, EFFICIENCY, MEAN, MEDIAN, STD

def comparaison_mean(df, limit=1000):
    """
    Compare different sampling methods based on their mean error and efficiency.

    Parameters:
    - df: DataFrame
        The input DataFrame containing the data.
    - limit: int, optional
        The number of rows to consider from the end of the DataFrame. Default is 1000.

    Returns:
    None
    """
    plt.figure(figsize=(10, 5))
    hroc = hourly_rate_of_change(df)
    df = df.tail(limit)
    X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_every_kth_point(df)
    MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
    plt.plot(MEAN, EFFICIENCY, label="Constant Polling Interval", marker='x')
    X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_reglin(df)
    MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
    plt.plot(MEAN, EFFICIENCY, label="Linear Regression", marker='x')
    X, EFFICIENCY, MEAN, MEDIAN, STD = test_optimal_sample(df)
    MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
    plt.plot(MEAN, EFFICIENCY, label="Optimal Polling Rate", marker='x')
    X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_avg_rate_of_change(df, hroc)
    MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY)
    plt.plot(MEAN, EFFICIENCY, label="Hourly Rate of Change", marker='x')
    plt.ylabel("Average seconds between polls")
    plt.xlabel("Average error")
    plt.ylim(0, 8000)
    plt.xlim(0, 1.3)
    plt.legend()
    plt.show()
def example_optimal_sample(dT=0.3):
    """
    This function demonstrates how to use the `optimal_sample` function to generate an optimal sample of greenhouse data.

    Parameters:
    dT (float): The threshold value for temperature difference. Default is 0.3.

    Returns:
    None
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(1000)
    df = optimal_sample(df, threshold_dT=dT)
@@ -169,6 +315,15 @@ def example_optimal_sample(dT = 0.3):
    plt.show()

def histogram_sample_every_kth_point(k=10):
    """
    Generate a histogram of the differences between the original data and the sampled data.

    Parameters:
    - k (int): The sampling interval. Only every kth point will be included in the sampled data.

    Returns:
    None
    """
    df = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(1000)
    df_sampled = sample_every_kth_point(df, k)
@@ -192,9 +347,9 @@ def histogram_sample_every_kth_point(k=10):
# comparaison_mean(df)

# Temperature rate of change over the day
# df = generate_greenhouse_data("datasets/greenhouse.csv")
# hcor = hourly_rate_of_change(df)
# print(hcor)
# hcor.plot()
# plt.xlabel("Hour of the day")
# plt.ylabel("Average absolute rate of change (°C/hour)")

poll.py

@@ -2,6 +2,22 @@ import datetime
from analyze import hourly_rate_of_change

def sample_every_kth_point(df, k):
    """
    Sample every k-th point from a DataFrame.

    Parameters:
    - df: pandas DataFrame
        The DataFrame from which to sample the points.
    - k: int
        The interval between sampled points.

    Returns:
    - sampled_df: pandas DataFrame
        The DataFrame containing the sampled points.

    Raises:
    - ValueError: If k is not a positive integer or if k exceeds the number of rows in the DataFrame.
    """
    # Validate the input to ensure k is positive and does not exceed the DataFrame length
    if k <= 0:
        raise ValueError("k must be a positive integer.")
@@ -13,6 +29,17 @@ def sample_every_kth_point(df, k):
    return sampled_df
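A minimal usage sketch on a hypothetical synthetic frame; the sampling step between the validation and the return is outside this hunk, so only the documented behaviour is assumed:

import pandas as pd

times = pd.date_range("2024-05-01", periods=100, freq="10min")
df = pd.DataFrame({"time": times, "value": [20.0 + 0.01 * i for i in range(100)]})
df_sampled = sample_every_kth_point(df, k=10)   # keep roughly every 10th reading
print(len(df), "->", len(df_sampled))
# sample_every_kth_point(df, 0) raises ValueError, per the validation above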
def optimal_sample(df, threshold_dT=0.5):
    """
    Returns a subset of the input DataFrame `df` containing rows that have a significant change in value.

    Parameters:
    df (pandas.DataFrame): The input DataFrame.
    threshold_dT (float, optional): The threshold value for the change in value. Defaults to 0.5.

    Returns:
    pandas.DataFrame: A subset of the input DataFrame `df` containing rows with significant changes in value.
    """
    t0 = df["time"].iloc[0]
    indices = [0]
    times = [t0]
@@ -23,22 +50,45 @@ def optimal_sample(df, threshold_dT=0.5):
            indices.append(i)
    return df.iloc[indices]
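A hedged usage sketch on synthetic data; per the docstring, rows are kept only when the value has changed by more than threshold_dT since the last kept row (the selection loop itself is elided from this hunk):

import numpy as np
import pandas as pd

times = pd.date_range("2024-05-01", periods=200, freq="10min")
values = 20 + 5 * np.sin(np.linspace(0, 4 * np.pi, 200))
df = pd.DataFrame({"time": times, "value": values})
df_sampled = optimal_sample(df, threshold_dT=0.5)
print(len(df), "->", len(df_sampled))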
def sample_reglin(df, max_dT=0.5, max_poll_interval=2 * 3600):
    """
    Returns a subset of the input DataFrame `df` by sampling points based on a linear regression algorithm.

    Parameters:
    - df (pandas.DataFrame): The input DataFrame containing the time series data.
    - max_dT (float): The value difference that should be considered significant enough to add a new value.
      Defaults to 0.5.
    - max_poll_interval (int): The maximum time interval allowed between the first and last point in the subset.
      Defaults to 2 hours (2 * 3600 seconds).

    Returns:
    - pandas.DataFrame: A subset of the input DataFrame `df` containing the sampled points.

    Raises:
    - ValueError: If there is no point after the specified date.
    """
    indices = []

    def get_first_point_after(date):
        if df[df['time'] > date].empty:
            raise ValueError("No point after the date")
        return df[df['time'] > date].iloc[0]

    # Get first two points
    t0 = df["time"].iloc[0]
    t1 = df["time"].iloc[1]
    while True:
        v0 = df[df["time"] == t0]["value"].values[0]
        v1 = df[df["time"] == t1]["value"].values[0]
        # Calculate the slope
        s = abs((v1 - v0) / (t1 - t0).total_seconds())
        # Add max_dT/s to t1
        new_t = t1 + datetime.timedelta(seconds=min(max_dT / s, max_poll_interval))
        try:
            new_t = get_first_point_after(new_t)["time"]
            indices.append(df[df["time"] == new_t].index[0])
@@ -46,12 +96,24 @@ def sample_reglin(df,max_dT=0.5, max_poll_interval=2 * 3600):
            t1 = new_t
        except ValueError:
            break
    return df.loc[indices]
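A hedged usage sketch of the slope-based sampler on synthetic sine data with the documented defaults. The value series should not be flat: a zero slope between the first two points would make max_dT/s divide by zero in the loop above.

import numpy as np
import pandas as pd

times = pd.date_range("2024-05-01", periods=200, freq="10min")
values = 20 + 5 * np.sin(np.linspace(0, 4 * np.pi, 200))
df = pd.DataFrame({"time": times, "value": values})
# Poll more often where the local slope is steep, but never wait more than 2 h.
df_sampled = sample_reglin(df, max_dT=0.5, max_poll_interval=2 * 3600)
print(len(df), "->", len(df_sampled))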
def sample_avg_rate_of_change(df, poll_rate):
    """
    Sample a DataFrame using a per-hour polling interval.

    Args:
        df (pandas.DataFrame): The DataFrame containing the data.
        poll_rate (pandas.Series): The Series containing the poll interval in seconds for each hour of the day.

    Returns:
        pandas.DataFrame: The subset of rows kept whenever the time elapsed since the last kept row exceeds that hour's poll interval.
    """
    indices = [0]
    for i in range(len(df)):
        current_hour = df["time"].iloc[i].hour
        if df["time"].iloc[i] - df["time"].iloc[indices[-1]] > datetime.timedelta(seconds=poll_rate.iloc[current_hour]):
            indices.append(i)
    return df.iloc[indices]
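A minimal sketch of calling this sampler with a hypothetical constant per-hour schedule; in main.py the schedule is derived from hourly_rate_of_change, but any 24-entry Series of seconds satisfies the loop above:

import pandas as pd

times = pd.date_range("2024-05-01", periods=288, freq="5min")  # one day at 5-minute resolution
df = pd.DataFrame({"time": times, "value": [20.0] * 288})
poll_rate = pd.Series([1800] * 24)   # hypothetical: poll at most every 30 minutes, for every hour of the day
df_sampled = sample_avg_rate_of_change(df, poll_rate)
print(len(df), "->", len(df_sampled))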