diff --git a/analyze.py b/analyze.py index 31cb08d..5bd34ee 100644 --- a/analyze.py +++ b/analyze.py @@ -2,6 +2,21 @@ import pandas as pd import matplotlib.pyplot as plt def error(df, df_original, column_name): + """ + Calculate the error between the values in a column of a DataFrame and the last value before each timestamp. + + Args: + df (pandas.DataFrame): The DataFrame containing the values. + df_original (pandas.DataFrame): The original DataFrame containing the timestamps and values. + column_name (str): The name of the column to calculate the error for. + + Returns: + list: A list of absolute differences between the values in the specified column and the last value before each timestamp. + + Raises: + ValueError: If the specified column does not exist in the DataFrame. + """ + diff = [] # Check if the column exists in the DataFrame if column_name not in df.columns: @@ -23,6 +38,19 @@ def error(df, df_original, column_name): def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"): + """ + Plots a histogram of the given data series. + + Parameters: + - data_series (array-like): The data series to plot the histogram for. + - bins (int): The number of bins to use for the histogram. Default is 10. + - title (str): The title of the histogram plot. Default is "Distribution of Absolute Differences". + + Returns: + None + """ + import matplotlib.pyplot as plt + plt.figure(figsize=(8, 4)) # Set the figure size for better readability plt.hist(data_series, bins=bins, color='blue', alpha=0.7, edgecolor='black') plt.title(title) @@ -32,15 +60,40 @@ def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differe plt.show() def compute_efficiency(df): - #compute the time differnece between the first and last point + """ + Compute the efficiency of a DataFrame, i.e. the average number of seconds between collected data points. + + Parameters: + df (pandas.DataFrame): The input data frame. + + Returns: + float: The efficiency value, in seconds per data point. + + """ + # compute the time difference between the first and last point time_diff = df["time"].iloc[-1] - df["time"].iloc[0] - #compute the number of points + # compute the number of points num_points = len(df) - #compute the efficiency + # compute the efficiency efficiency = time_diff.total_seconds() / num_points return efficiency def hourly_rate_of_change(df): + """ + Calculate the average absolute rate of change per hour for a given DataFrame. + + Args: + df (pandas.DataFrame): The DataFrame containing the data. + + Returns: + pandas.Series: A Series containing the average absolute rate of change per hour. + + Raises: + ValueError: If the DataFrame does not include 'time' and 'value' columns, or if it is empty. + ValueError: If the 'time' column is not of datetime type. + + """ + # Check if required columns exist if 'time' not in df.columns or 'value' not in df.columns: raise ValueError("DataFrame must include 'time' and 'value' columns.") diff --git a/generate_data.py b/generate_data.py index e49abb0..6afd1b4 100644 --- a/generate_data.py +++ b/generate_data.py @@ -4,6 +4,17 @@ from opensimplex import OpenSimplex import datetime def generate_greenhouse_data(filepath): + """ + Generate filtered greenhouse data from a CSV file. + + Parameters: + filepath (str): The path to the CSV file. + + Returns: + pandas.DataFrame: The filtered greenhouse data. + """ +
+ # Read the CSV file into a DataFrame, parsing 'time' as datetime df = pd.read_csv(filepath, parse_dates=["time"], dtype={"id": str, "value": float}) @@ -25,6 +36,21 @@ def generate_greenhouse_data(filepath): def generate_simplex(start_time=None, end_time=None, interval=600, max_temp=30, min_temp=10, frequency=10): + """ + Generate a DataFrame with time and temperature values using Simplex noise. + + Parameters: + - start_time (datetime): The start time for generating the data. If not provided, it defaults to 1 day before the end_time. + - end_time (datetime): The end time for generating the data. If not provided, it defaults to the current time. + - interval (int): The time interval in seconds between each data point. Defaults to 600 seconds (10 minutes). + - max_temp (float): The maximum temperature value. Defaults to 30. + - min_temp (float): The minimum temperature value. Defaults to 10. + - frequency (int): The frequency parameter for the Simplex noise generator. Defaults to 10. + + Returns: + - df (DataFrame): A pandas DataFrame with 'time' and 'value' columns representing the generated time and temperature values. + """ + # Default time settings if none provided if end_time is None: end_time = datetime.datetime.now() diff --git a/main.py b/main.py index bca2c42..da0b25a 100644 --- a/main.py +++ b/main.py @@ -4,11 +4,37 @@ from generate_data import * from analyze import * from poll import * -# sort two lists based on the first list -def sort(X,Y): - return zip(*sorted(zip(X,Y))) +def sort(X, Y): + """ + Sorts two lists X and Y in ascending order based on the values in X. + + Args: + X (list): The first list to be sorted. + Y (list): The second list to be sorted. + + Returns: + zip: An iterator that unpacks into the sorted X and Y values as tuples. + + Example: + X = [3, 1, 2] + Y = ['c', 'a', 'b'] + sorted_X, sorted_Y = sort(X, Y) + # sorted_X: (1, 2, 3) + # sorted_Y: ('a', 'b', 'c') + """ + return zip(*sorted(zip(X, Y))) def plot_temperature_data(df, recent_count=None): + """ + Plots the temperature data from a DataFrame. + + Args: + df (pandas.DataFrame): The DataFrame containing the temperature data. + recent_count (int, optional): The number of recent data points to plot. If specified, only the last 'recent_count' rows will be plotted. Defaults to None. + + Returns: + None + """ plt.figure(figsize=(5, 5)) # Check if recent_count is specified and valid @@ -25,6 +51,20 @@ def plot_temperature_data(df, recent_count=None): plt.show() def test_sample_every_kth_point(df): + """ + Test the sample_every_kth_point function with different values of k. + + Parameters: + - df: The input DataFrame containing the data. + + Returns: + - X: The array of k values tested. + - EFFICIENCY: The efficiency values for each sampling. + - MEAN: The mean error values for each sampling. + - MEDIAN: The median error values for each sampling. + - STD: The standard deviation of error values for each sampling. + """ + X = np.arange(1, 10, 1) MEAN = [] STD = [] @@ -46,24 +86,59 @@ def test_sample_every_kth_point(df): def example_sample_every_kth_point(k=10): + """ + Example function that demonstrates how to sample every kth point from a dataframe and plot the temperature data. + + Parameters: + k (int): The sampling interval. Default is 10.
+ + Returns: + None + """ df = generate_greenhouse_data("datasets/greenhouse.csv") df = df.tail(150) df = sample_every_kth_point(df, k) plot_temperature_data(df) def example_sample_reglin(): + """ + This function demonstrates the usage of the sample_reglin function. + It generates greenhouse data, selects the last 150 rows, applies the sample_reglin function, + and plots the temperature data. + """ df = generate_greenhouse_data("datasets/greenhouse.csv") df = df.tail(150) df = sample_reglin(df) plot_temperature_data(df) -def exaample_optimal_sample(dT = 0.3): +def example_optimal_sample(dT = 0.3): + """ + Example function that demonstrates the usage of the optimal_sample function. + + Parameters: + dT (float): The threshold value for temperature difference. Default is 0.3. + + Returns: + None + """ df = generate_greenhouse_data("datasets/greenhouse.csv") df = df.tail(150) df = optimal_sample(df, threshold_dT=dT) plot_temperature_data(df) def example_sample_avg_rate_of_change(): + """ + This function demonstrates how to calculate the sample average rate of change for temperature data. + It generates greenhouse data, calculates the hourly rate of change, selects the last 150 records, + and then calculates the sample average rate of change based on the hourly rate of change. + Finally, it plots the temperature data. + + Parameters: + None + + Returns: + None + """ df = generate_greenhouse_data("datasets/greenhouse.csv") hroc = hourly_rate_of_change(df) df = df.tail(150) @@ -71,6 +146,25 @@ def example_sample_avg_rate_of_change(): plot_temperature_data(df) def test_sample_reglin(df): + """ + Perform a test on the sample_reglin function with different values of max_dT. + + Parameters: + - df: DataFrame + The input DataFrame containing temperature data. + + Returns: + - X: ndarray + An array of values ranging from 0.4 to 3 with a step of 0.05. + - EFFICIENCY: list + A list of efficiency values calculated for each max_dT value. + - MEAN: list + A list of mean error values calculated for each max_dT value. + - MEDIAN: list + A list of median error values calculated for each max_dT value. + - STD: list + A list of standard deviation error values calculated for each max_dT value. + """ X = np.arange(0.4, 3, 0.05) MEAN = [] STD = [] @@ -91,6 +185,20 @@ def test_sample_reglin(df): def test_optimal_sample(df): + """ + Test the optimal sample function with different threshold values. + + Args: + df (pandas.DataFrame): The input DataFrame containing temperature data. + + Returns: + tuple: A tuple containing the following lists: + - X (numpy.ndarray): An array of threshold values. + - EFFICIENCY (list): A list of efficiency values for each threshold. + - MEAN (list): A list of mean error values for each threshold. + - MEDIAN (list): A list of median error values for each threshold. + - STD (list): A list of standard deviation error values for each threshold. + """ X = np.arange(0.1, 3, 0.05) MEAN = [] STD = [] @@ -109,7 +217,24 @@ def test_optimal_sample(df): EFFICIENCY.append(compute_efficiency(df_sampeld)) return X, EFFICIENCY, MEAN, MEDIAN, STD -def test_sample_avg_rate_of_change(df,hourly_rate_of_change): +def test_sample_avg_rate_of_change(df, hourly_rate_of_change): + """ + Test the sample average rate of change. + + This function takes a DataFrame `df` and the `hourly_rate_of_change` as input. + It performs a series of calculations on the data and returns the results. + + Parameters: + - df (pandas.DataFrame): The input DataFrame containing the data. 
+ - hourly_rate_of_change (float): The hourly rate of change. + + Returns: + - X (numpy.ndarray): An array of values ranging from 0.01 to 3 with a step of 0.05. + - EFFICIENCY (list): A list of efficiency values calculated for each sample. + - MEAN (list): A list of mean values calculated for each sample. + - MEDIAN (list): A list of median values calculated for each sample. + - STD (list): A list of standard deviation values calculated for each sample. + """ X = np.arange(0.01, 3, 0.05) MEAN = [] STD = [] @@ -128,35 +253,56 @@ def test_sample_avg_rate_of_change(df,hourly_rate_of_change): EFFICIENCY.append(compute_efficiency(df_sampled)) return X, EFFICIENCY, MEAN, MEDIAN, STD -def comparaison_mean(df,limit=1000): +def comparaison_mean(df, limit=1000): + """ + Compare different sampling methods based on their mean and efficiency. + + Parameters: + - df: DataFrame + The input DataFrame containing the data. + - limit: int, optional + The number of rows to consider from the end of the DataFrame. Default is 1000. + + Returns: + None + """ plt.figure(figsize=(10, 5)) hroc = hourly_rate_of_change(df) df = df.tail(limit) X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_every_kth_point(df) MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY) - plt.plot( MEAN,EFFICIENCY, label="Constant Polling Interval", marker='x') + plt.plot(MEAN, EFFICIENCY, label="Constant Polling Interval", marker='x') X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_reglin(df) MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY) - plt.plot( MEAN,EFFICIENCY, label="Linear Regression", marker='x') + plt.plot(MEAN, EFFICIENCY, label="Linear Regression", marker='x') X, EFFICIENCY, MEAN, MEDIAN, STD = test_optimal_sample(df) MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY) - plt.plot( MEAN,EFFICIENCY, label="Optimal Polling rate", marker='x') + plt.plot(MEAN, EFFICIENCY, label="Optimal Polling rate", marker='x') - X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_avg_rate_of_change(df,hroc) + X, EFFICIENCY, MEAN, MEDIAN, STD = test_sample_avg_rate_of_change(df, hroc) MEAN, EFFICIENCY = sort(MEAN, EFFICIENCY) - plt.plot( MEAN,EFFICIENCY, label="Hourly Rate of Change", marker='x') + plt.plot(MEAN, EFFICIENCY, label="Hourly Rate of Change", marker='x') plt.ylabel("Average seconds between polls") plt.xlabel("Average error") plt.ylim(0, 8000) - plt.xlim(0,1.3) + plt.xlim(0, 1.3) plt.legend() plt.show() def example_optimal_sample(dT = 0.3): + """ + This function demonstrates how to use the `optimal_sample` function to generate an optimal sample of greenhouse data. + + Parameters: + dT (float): The threshold value for temperature difference. Default is 0.3. + + Returns: + None + """ df = generate_greenhouse_data("datasets/greenhouse.csv") df = df.tail(1000) df = optimal_sample(df, threshold_dT=dT) @@ -169,6 +315,15 @@ def example_optimal_sample(dT = 0.3): plt.show() def histogram_sample_every_kth_point(k=10): + """ + Generate a histogram of the differences between the original data and the sampled data. + + Parameters: + - k (int): The sampling interval. Only every kth point will be included in the sampled data. 
+ + Returns: + None + """ df = generate_greenhouse_data("datasets/greenhouse.csv") df = df.tail(1000) df_sampled = sample_every_kth_point(df, k) @@ -192,9 +347,9 @@ def histogram_sample_every_kth_point(k=10): # comparaison_mean(df) # Temperature rate of change over the day -df = generate_greenhouse_data("datasets/greenhouse.csv") -hcor = hourly_rate_of_change(df) -print(hcor) +# df = generate_greenhouse_data("datasets/greenhouse.csv") +# hcor = hourly_rate_of_change(df) +# print(hcor) # hcor.plot() # plt.xlabel("Hour of the day") # plt.ylabel("Average absolute rate of change (°C/hour)") diff --git a/poll.py b/poll.py index 7dd61ff..2e77e0f 100644 --- a/poll.py +++ b/poll.py @@ -2,6 +2,22 @@ import datetime from analyze import hourly_rate_of_change def sample_every_kth_point(df, k): + """ + Sample every k-th point from a DataFrame. + + Parameters: + - df: pandas DataFrame + The DataFrame from which to sample the points. + - k: int + The interval between sampled points. + + Returns: + - sampled_df: pandas DataFrame + The DataFrame containing the sampled points. + Raises: + - ValueError: If k is not a positive integer or if k exceeds the number of rows in the DataFrame. + """ + # Validate the input to ensure k is positive and does not exceed the DataFrame length if k <= 0: raise ValueError("k must be a positive integer.") @@ -13,6 +29,17 @@ def sample_every_kth_point(df, k): return sampled_df def optimal_sample(df, threshold_dT=0.5): + """ + Returns a subset of the input DataFrame `df` containing rows that have a significant change in value. + + Parameters: + df (pandas.DataFrame): The input DataFrame. + threshold_dT (float, optional): The minimum change in value required to record a new sample. Defaults to 0.5. + + Returns: + pandas.DataFrame: A subset of the input DataFrame `df` containing rows with significant changes in value. + """ + t0 = df["time"].iloc[0] indices = [0] times = [t0] @@ -23,22 +50,45 @@ def optimal_sample(df, threshold_dT=0.5): indices.append(i) return df.iloc[indices] -def sample_reglin(df,max_dT=0.5, max_poll_interval=2 * 3600): +def sample_reglin(df, max_dT=0.5, max_poll_interval=2 * 3600): + """ + Returns a subset of the input DataFrame `df` by projecting the next poll time from the slope of the last two sampled points. + + Parameters: + - df (pandas.DataFrame): The input DataFrame containing the time series data. + - max_dT (float): The target change in value between consecutive polls, used to project the next poll time from the current slope. + Defaults to 0.5. + - max_poll_interval (int): The maximum number of seconds allowed between consecutive polls. + Defaults to 2 hours (2 * 3600 seconds). + + Returns: + - pandas.DataFrame: A subset of the input DataFrame `df` containing the sampled points. + + Note: + - Sampling stops once no data point exists after the next projected poll time.
+ + """ indices = [] + def get_first_point_after(date): - if(df[df['time'] > date].empty): + if df[df['time'] > date].empty: raise ValueError("No point before the date") return df[df['time'] > date].iloc[0] + # Get first two points t0 = df["time"].iloc[0] t1 = df["time"].iloc[1] + while True: v0 = df[df["time"] == t0]["value"].values[0] v1 = df[df["time"] == t1]["value"].values[0] + # Calculate the slope s = abs((v1 - v0) / (t1 - t0).total_seconds()) - #add max_dT/s to t1 - new_t = t1 + datetime.timedelta(seconds=min(max_dT/s, max_poll_interval)) + + # Add max_dT/s to t1 + new_t = t1 + datetime.timedelta(seconds=min(max_dT / s, max_poll_interval)) + try: new_t = get_first_point_after(new_t)["time"] indices.append(df[df["time"] == new_t].index[0]) @@ -46,12 +96,24 @@ def sample_reglin(df,max_dT=0.5, max_poll_interval=2 * 3600): t1 = new_t except ValueError: break + return df.loc[indices] -def sample_avg_rate_of_change(df,poll_rate): +def sample_avg_rate_of_change(df, poll_rate): + """ + Calculate the sample average rate of change for a given DataFrame. + + Args: + df (pandas.DataFrame): The DataFrame containing the data. + poll_rate (pandas.Series): The Series containing the poll rates for each hour. + + Returns: + pandas.DataFrame: The subset of the DataFrame with the indices where the rate of change exceeds the poll rate. + + """ indices = [0] for i in range(len(df)): current_hour = df["time"].iloc[i].hour - if(df["time"].iloc[i] - df["time"].iloc[indices[-1]] > datetime.timedelta(seconds = poll_rate.iloc[current_hour])): + if df["time"].iloc[i] - df["time"].iloc[indices[-1]] > datetime.timedelta(seconds=poll_rate.iloc[current_hour]): indices.append(i) return df.iloc[indices]