added documentation

2026-04-10 19:50:20 +02:00 · 2024-05-07 00:36:41 +02:00
parent ac4d95bd07
commit 3fd37213e1
4 changed files with 322 additions and 24 deletions
--- a/analyze.py
+++ b/analyze.py
@@ -2,6 +2,21 @@ import pandas as pd
 import matplotlib.pyplot as plt

 def error(df, df_original, column_name):
+    """
+    Calculate the error between the values in a column of a DataFrame and the last value before each timestamp.
+
+    Args:
+        df (pandas.DataFrame): The DataFrame containing the values.
+        df_original (pandas.DataFrame): The original DataFrame containing the timestamps and values.
+        column_name (str): The name of the column to calculate the error for.
+
+    Returns:
+        list: A list of absolute differences between the values in the specified column and the last value before each timestamp.
+
+    Raises:
+        ValueError: If the specified column does not exist in the DataFrame.
+    """
+    
    diff = []
    # Check if the column exists in the DataFrame
    if column_name not in df.columns:
@@ -23,6 +38,19 @@ def error(df, df_original, column_name):


 def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differences"):
+    """
+    Plots a histogram of the given data series.
+
+    Parameters:
+    - data_series (array-like): The data series to plot the histogram for.
+    - bins (int): The number of bins to use for the histogram. Default is 10.
+    - title (str): The title of the histogram plot. Default is "Distribution of Absolute Differences".
+
+    Returns:
+    None
+    """
+    import matplotlib.pyplot as plt
+
    plt.figure(figsize=(8, 4))  # Set the figure size for better readability
    plt.hist(data_series, bins=bins, color='blue', alpha=0.7, edgecolor='black')
    plt.title(title)
@@ -32,7 +60,17 @@ def plot_histogram(data_series, bins=10, title="Distribution of Absolute Differe
    plt.show()

 def compute_efficiency(df):
-    #compute the time differnece between the first and last point
+    """
+    Compute the efficiency of a data frame, i.e. the time taken to collect each data point.
+
+    Parameters:
+    df (pandas.DataFrame): The input data frame.
+
+    Returns:
+    float: The efficiency value.
+
+    """
+    # compute the time difference between the first and last point
    time_diff = df["time"].iloc[-1] - df["time"].iloc[0]
    # compute the number of points
    num_points = len(df)
@@ -41,6 +79,21 @@ def compute_efficiency(df):
    return efficiency

 def hourly_rate_of_change(df):
+    """
+    Calculate the average absolute rate of change per hour for a given DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The DataFrame containing the data.
+
+    Returns:
+        pandas.Series: A Series containing the average absolute rate of change per hour.
+
+    Raises:
+        ValueError: If the DataFrame does not include 'time' and 'value' columns, or if it is empty.
+        ValueError: If the 'time' column is not of datetime type.
+
+    """
+    
    # Check if required columns exist
    if 'time' not in df.columns or 'value' not in df.columns:
        raise ValueError("DataFrame must include 'time' and 'value' columns.")
--- a/generate_data.py
+++ b/generate_data.py
@@ -4,6 +4,19 @@ from opensimplex import OpenSimplex
 import datetime

 def generate_greenhouse_data(filepath):
+    """
+    Generate filtered greenhouse data from a CSV file.
+    
+    Parameters:
+    filepath (str): The path to the CSV file.
+    
+    Returns:
+    pandas.DataFrame: The filtered greenhouse data.
+    """
+    
+    # Rest of the code...
+def generate_greenhouse_data(filepath):
+
    # Read the CSV file into a DataFrame, parsing 'time' as datetime
    df = pd.read_csv(filepath, parse_dates=["time"], dtype={"id": str, "value": float})
    
@@ -25,6 +38,21 @@ def generate_greenhouse_data(filepath):


 def generate_simplex(start_time=None, end_time=None, interval=600, max_temp=30, min_temp=10, frequency=10):
+    """
+    Generate a DataFrame with time and temperature values using Simplex noise.
+
+    Parameters:
+    - start_time (datetime): The start time for generating the data. If not provided, it defaults to 1 day before the end_time.
+    - end_time (datetime): The end time for generating the data. If not provided, it defaults to the current time.
+    - interval (int): The time interval in seconds between each data point. Defaults to 600 seconds (10 minutes).
+    - max_temp (float): The maximum temperature value. Defaults to 30.
+    - min_temp (float): The minimum temperature value. Defaults to 10.
+    - frequency (int): The frequency parameter for the Simplex noise generator. Defaults to 10.
+
+    Returns:
+    - df (DataFrame): A pandas DataFrame with 'time' and 'value' columns representing the generated time and temperature values.
+    """
+    
    # Default time settings if none provided
    if end_time is None:
        end_time = datetime.datetime.now()
--- a/main.py
+++ b/main.py
@@ -4,11 +4,37 @@ from generate_data import *
 from analyze import *
 from poll import *

-# sort two lists based on the first list
 def sort(X, Y):
+    """
+    Sorts two lists X and Y in ascending order based on the values in X.
+
+    Args:
+        X (list): The first list to be sorted.
+        Y (list): The second list to be sorted.
+
+    Returns:
+        zip: An iterator that yields two tuples: the values of X in ascending order, then the matching values of Y.
+
+    Example:
+        X = [3, 1, 2]
+        Y = ['c', 'a', 'b']
+        sorted_X, sorted_Y = sort(X, Y)
+        # sorted_X: (1, 2, 3)
+        # sorted_Y: ('a', 'b', 'c')
+    """
    return zip(*sorted(zip(X, Y)))

 def plot_temperature_data(df, recent_count=None):
+    """
+    Plots the temperature data from a DataFrame.
+
+    Args:
+        df (pandas.DataFrame): The DataFrame containing the temperature data.
+        recent_count (int, optional): The number of recent data points to plot. If specified, only the last 'recent_count' rows will be plotted. Defaults to None.
+
+    Returns:
+        None
+    """
    plt.figure(figsize=(5, 5))
    
    # Check if recent_count is specified and valid
@@ -25,6 +51,20 @@ def plot_temperature_data(df, recent_count=None):
    plt.show()

 def test_sample_every_kth_point(df):
+    """
+    Test the sample_every_kth_point function with different values of k.
+
+    Parameters:
+    - df: The input DataFrame containing the data.
+
+    Returns:
+    - X: The array of k values tested (np.arange(1, 10, 1)).
+    - EFFICIENCY: The efficiency values for each sampling.
+    - MEAN: The mean error values for each sampling.
+    - MEDIAN: The median error values for each sampling.
+    - STD: The standard deviation of error values for each sampling.
+    """
+    
    X = np.arange(1, 10, 1)
    MEAN = []
    STD = []
@@ -46,24 +86,59 @@ def test_sample_every_kth_point(df):


 def example_sample_every_kth_point(k=10):
+    """
+    Example function that demonstrates how to sample every kth point from a dataframe and plot the temperature data.
+
+    Parameters:
+    k (int): The sampling interval. Default is 10.
+
+    Returns:
+    None
+    """
    df  = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(150)
    df = sample_every_kth_point(df, k)
    plot_temperature_data(df)

 def example_sample_reglin():
+    """
+    This function demonstrates the usage of the sample_reglin function.
+    It generates greenhouse data, selects the last 150 rows, applies the sample_reglin function,
+    and plots the temperature data.
+    """
    df  = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(150)
    df = sample_reglin(df)
    plot_temperature_data(df)

-def exaample_optimal_sample(dT = 0.3):
+def example_optimal_sample(dT = 0.3):
+    """
+    Example function that demonstrates the usage of the optimal_sample function.
+    
+    Parameters:
+        dT (float): The threshold value for temperature difference. Default is 0.3.
+    
+    Returns:
+        None
+    """
    df  = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(150)
    df = optimal_sample(df, threshold_dT=dT)
    plot_temperature_data(df)

 def example_sample_avg_rate_of_change():
+    """
+    This function demonstrates how to calculate the sample average rate of change for temperature data.
+    It generates greenhouse data, calculates the hourly rate of change, selects the last 150 records,
+    and then calculates the sample average rate of change based on the hourly rate of change.
+    Finally, it plots the temperature data.
+
+    Parameters:
+    None
+
+    Returns:
+    None
+    """
    df  = generate_greenhouse_data("datasets/greenhouse.csv")
    hroc = hourly_rate_of_change(df)
    df = df.tail(150)
@@ -71,6 +146,25 @@ def example_sample_avg_rate_of_change():
    plot_temperature_data(df)

 def test_sample_reglin(df):
+    """
+    Perform a test on the sample_reglin function with different values of max_dT.
+
+    Parameters:
+    - df: DataFrame
+        The input DataFrame containing temperature data.
+
+    Returns:
+    - X: ndarray
+        An array of values ranging from 0.4 to 3 with a step of 0.05.
+    - EFFICIENCY: list
+        A list of efficiency values calculated for each max_dT value.
+    - MEAN: list
+        A list of mean error values calculated for each max_dT value.
+    - MEDIAN: list
+        A list of median error values calculated for each max_dT value.
+    - STD: list
+        A list of standard deviation error values calculated for each max_dT value.
+    """
    X = np.arange(0.4, 3, 0.05)
    MEAN = []
    STD = []
@@ -91,6 +185,20 @@ def test_sample_reglin(df):


 def test_optimal_sample(df):
+    """
+    Test the optimal sample function with different threshold values.
+
+    Args:
+        df (pandas.DataFrame): The input DataFrame containing temperature data.
+
+    Returns:
+        tuple: A tuple containing the following lists:
+            - X (numpy.ndarray): An array of threshold values.
+            - EFFICIENCY (list): A list of efficiency values for each threshold.
+            - MEAN (list): A list of mean error values for each threshold.
+            - MEDIAN (list): A list of median error values for each threshold.
+            - STD (list): A list of standard deviation error values for each threshold.
+    """
    X = np.arange(0.1, 3, 0.05)
    MEAN = []
    STD = []
@@ -110,6 +218,23 @@ def test_optimal_sample(df):
    return X, EFFICIENCY, MEAN, MEDIAN, STD

 def test_sample_avg_rate_of_change(df, hourly_rate_of_change):
+    """
+    Test the sample average rate of change.
+
+    This function takes a DataFrame `df` and the `hourly_rate_of_change` as input.
+    It performs a series of calculations on the data and returns the results.
+
+    Parameters:
+    - df (pandas.DataFrame): The input DataFrame containing the data.
+    - hourly_rate_of_change (float): The hourly rate of change.
+
+    Returns:
+    - X (numpy.ndarray): An array of values ranging from 0.01 to 3 with a step of 0.05.
+    - EFFICIENCY (list): A list of efficiency values calculated for each sample.
+    - MEAN (list): A list of mean values calculated for each sample.
+    - MEDIAN (list): A list of median values calculated for each sample.
+    - STD (list): A list of standard deviation values calculated for each sample.
+    """
    X = np.arange(0.01, 3, 0.05)
    MEAN = []
    STD = []
@@ -129,6 +254,18 @@ def test_sample_avg_rate_of_change(df,hourly_rate_of_change):
    return X, EFFICIENCY, MEAN, MEDIAN, STD

 def comparaison_mean(df, limit=1000):
+    """
+    Compare different sampling methods based on their mean and efficiency.
+
+    Parameters:
+    - df: DataFrame
+        The input DataFrame containing the data.
+    - limit: int, optional
+        The number of rows to consider from the end of the DataFrame. Default is 1000.
+
+    Returns:
+    None
+    """
    plt.figure(figsize=(10, 5))
    hroc = hourly_rate_of_change(df)
    df = df.tail(limit)
@@ -157,6 +294,15 @@ def comparaison_mean(df,limit=1000):
    plt.show()

 def example_optimal_sample(dT = 0.3):
+    """
+    This function demonstrates how to use the `optimal_sample` function to generate an optimal sample of greenhouse data.
+    
+    Parameters:
+        dT (float): The threshold value for temperature difference. Default is 0.3.
+    
+    Returns:
+        None
+    """
    df  = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(1000)
    df = optimal_sample(df, threshold_dT=dT)
@@ -169,6 +315,15 @@ def example_optimal_sample(dT = 0.3):
    plt.show()

 def histogram_sample_every_kth_point(k=10):
+    """
+    Generate a histogram of the differences between the original data and the sampled data.
+    
+    Parameters:
+    - k (int): The sampling interval. Only every kth point will be included in the sampled data.
+    
+    Returns:
+    None
+    """
    df  = generate_greenhouse_data("datasets/greenhouse.csv")
    df = df.tail(1000)
    df_sampled = sample_every_kth_point(df, k)
@@ -192,9 +347,9 @@ def histogram_sample_every_kth_point(k=10):
 # comparaison_mean(df)

 # Temperature rate of change over the day
-df = generate_greenhouse_data("datasets/greenhouse.csv")
-hcor = hourly_rate_of_change(df)
-print(hcor)
+# df = generate_greenhouse_data("datasets/greenhouse.csv")
+# hcor = hourly_rate_of_change(df)
+# print(hcor)
 # hcor.plot()
 # plt.xlabel("Hour of the day")
 # plt.ylabel("Average absolute rate of change (°C/hour)")
--- a/poll.py
+++ b/poll.py
@@ -2,6 +2,22 @@ import datetime
 from analyze import hourly_rate_of_change

 def sample_every_kth_point(df, k):
+    """
+    Sample every k-th point from a DataFrame.
+
+    Parameters:
+    - df: pandas DataFrame
+        The DataFrame from which to sample the points.
+    - k: int
+        The interval between sampled points.
+
+    Returns:
+    - sampled_df: pandas DataFrame
+        The DataFrame containing the sampled points.
+    Raises:
+    - ValueError: If k is not a positive integer or if k exceeds the number of rows in the DataFrame.
+    """
+
    # Validate the input to ensure k is positive and does not exceed the DataFrame length
    if k <= 0:
        raise ValueError("k must be a positive integer.")
@@ -13,6 +29,17 @@ def sample_every_kth_point(df, k):
    return sampled_df

 def optimal_sample(df, threshold_dT=0.5):
+    """
+    Returns a subset of the input DataFrame `df` containing rows that have a significant change in value.
+
+    Parameters:
+        df (pandas.DataFrame): The input DataFrame.
+        threshold_dT (float, optional): The threshold value for the change in value. Defaults to 0.5.
+
+    Returns:
+        pandas.DataFrame: A subset of the input DataFrame `df` containing rows with significant changes in value.
+    """
+    
    t0 = df["time"].iloc[0]
    indices = [0]
    times = [t0]
@@ -24,21 +51,44 @@ def optimal_sample(df, threshold_dT=0.5):
    return df.iloc[indices]
        
 def sample_reglin(df, max_dT=0.5, max_poll_interval=2 * 3600):
+    """
+    Returns a subset of the input DataFrame `df`, picking each next sample by linearly extrapolating the slope between the two most recent reference points.
+
+    Parameters:
+    - df (pandas.DataFrame): The input DataFrame containing the time series data.
+    - max_dT (float): The value difference that should be considered significant enough to add a new value.
+                      Defaults to 0.5.
+    - max_poll_interval (int): The maximum time, in seconds, to wait after the latest reference point before taking the next sample.
+                               Defaults to 2 hours (2 * 3600 seconds).
+
+    Returns:
+    - pandas.DataFrame: A subset of the input DataFrame `df` containing the sampled points.
+
+    Raises:
+    - IndexError: If `df` has fewer than two rows. (The ValueError raised internally when no later point exists only ends the sampling loop.)
+
+    """
    indices = []
+
    def get_first_point_after(date):
-        if(df[df['time'] > date].empty):
+        if df[df['time'] > date].empty:
            raise ValueError("No point before the date")
        return df[df['time'] > date].iloc[0]
+
    # Get first two points
    t0 = df["time"].iloc[0]
    t1 = df["time"].iloc[1]
+
    while True:
        v0 = df[df["time"] == t0]["value"].values[0]
        v1 = df[df["time"] == t1]["value"].values[0]
+
        # Calculate the slope
        s = abs((v1 - v0) / (t1 - t0).total_seconds())
-        #add max_dT/s to t1
+
+        # Add max_dT/s to t1
        new_t = t1 + datetime.timedelta(seconds=min(max_dT / s, max_poll_interval))
+
        try:
            new_t = get_first_point_after(new_t)["time"]
            indices.append(df[df["time"] == new_t].index[0])
@@ -46,12 +96,24 @@ def sample_reglin(df,max_dT=0.5, max_poll_interval=2 * 3600):
            t1 = new_t
        except ValueError:
            break
+
    return df.loc[indices]

 def sample_avg_rate_of_change(df, poll_rate):
+    """
+    Sample a DataFrame using a polling interval that depends on the hour of day.
+
+    Args:
+        df (pandas.DataFrame): The DataFrame containing the data.
+        poll_rate (pandas.Series): Polling interval in seconds for each hour of the day, looked up by position (0-23).
+
+    Returns:
+        pandas.DataFrame: The first row plus every row whose time since the last kept row exceeds the polling interval for that row's hour.
+
+    """
    indices = [0]
    for i in range(len(df)):
        current_hour = df["time"].iloc[i].hour
-        if(df["time"].iloc[i] - df["time"].iloc[indices[-1]] > datetime.timedelta(seconds = poll_rate.iloc[current_hour])):
+        if df["time"].iloc[i] - df["time"].iloc[indices[-1]] > datetime.timedelta(seconds=poll_rate.iloc[current_hour]):
            indices.append(i)
    return df.iloc[indices]